12 changes: 4 additions & 8 deletions llvm/include/llvm/TableGen/Record.h
Original file line number Diff line number Diff line change
Expand Up @@ -401,9 +401,7 @@ class Init {
/// variables which may not be defined at the time the expression is formed.
/// If a value is set for the variable later, this method will be called on
/// users of the value to allow the value to propagate out.
virtual const Init *resolveReferences(Resolver &R) const {
return const_cast<Init *>(this);
}
virtual const Init *resolveReferences(Resolver &R) const { return this; }

/// Get the \p Init value of the specified bit.
virtual const Init *getBit(unsigned Bit) const = 0;
Expand Down Expand Up @@ -475,9 +473,7 @@ class UnsetInit : public Init {
const Init *getCastTo(const RecTy *Ty) const override;
const Init *convertInitializerTo(const RecTy *Ty) const override;

const Init *getBit(unsigned Bit) const override {
return const_cast<UnsetInit*>(this);
}
const Init *getBit(unsigned Bit) const override { return this; }

/// Is this a complete value with no unset (uninitialized) subvalues?
bool isComplete() const override { return false; }
Expand Down Expand Up @@ -579,7 +575,7 @@ class BitInit final : public TypedInit {

const Init *getBit(unsigned Bit) const override {
assert(Bit < 1 && "Bit index out of range!");
return const_cast<BitInit*>(this);
return this;
}

bool isConcrete() const override { return true; }
Expand Down Expand Up @@ -1318,7 +1314,7 @@ class VarBitInit final : public TypedInit {

const Init *getBit(unsigned B) const override {
assert(B < 1 && "Bit index out of range!");
return const_cast<VarBitInit*>(this);
return this;
}
};

Expand Down
8 changes: 5 additions & 3 deletions llvm/lib/CodeGen/CalcSpillWeights.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,10 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
// localLI = COPY other
// ...
// other = COPY localLI
TotalWeight += LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB);
TotalWeight += LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB);
TotalWeight +=
LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB, PSI);
TotalWeight +=
LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB, PSI);

NumInstr += 2;
}
Expand Down Expand Up @@ -272,7 +274,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
// Calculate instr weight.
bool Reads, Writes;
std::tie(Reads, Writes) = MI->readsWritesVirtualRegister(LI.reg());
Weight = LiveIntervals::getSpillWeight(Writes, Reads, &MBFI, *MI);
Weight = LiveIntervals::getSpillWeight(Writes, Reads, &MBFI, *MI, PSI);

// Give extra weight to what looks like a loop induction variable update.
if (Writes && IsExiting && LIS.isLiveOutOfMBB(LI, MBB))
Expand Down
19 changes: 15 additions & 4 deletions llvm/lib/CodeGen/LiveIntervals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ProfileSummary.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
Expand Down Expand Up @@ -875,14 +877,23 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {

float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
const MachineBlockFrequencyInfo *MBFI,
const MachineInstr &MI) {
return getSpillWeight(isDef, isUse, MBFI, MI.getParent());
const MachineInstr &MI,
ProfileSummaryInfo *PSI) {
return getSpillWeight(isDef, isUse, MBFI, MI.getParent(), PSI);
}

float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
const MachineBlockFrequencyInfo *MBFI,
const MachineBasicBlock *MBB) {
return (isDef + isUse) * MBFI->getBlockFreqRelativeToEntryBlock(MBB);
const MachineBasicBlock *MBB,
ProfileSummaryInfo *PSI) {
float Weight = isDef + isUse;
const auto *MF = MBB->getParent();
// When optimizing for size we only consider the codesize impact of spilling
// the register, not the runtime impact.
if (PSI && (MF->getFunction().hasOptSize() ||
llvm::shouldOptimizeForSize(MF, PSI, MBFI)))
return Weight;
return Weight * MBFI->getBlockFreqRelativeToEntryBlock(MBB);
}

LiveRange::Segment
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/CodeGen/RegAllocBasic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "AllocationOrder.h"
#include "RegAllocBase.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveIntervals.h"
Expand Down Expand Up @@ -140,6 +141,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(RABasic, "regallocbasic", "Basic Register Allocator", false,
false)

Expand Down Expand Up @@ -182,6 +184,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<LiveDebugVariables>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<MachineBlockFrequencyInfoWrapperPass>();
AU.addPreserved<MachineBlockFrequencyInfoWrapperPass>();
AU.addRequiredID(MachineDominatorsID);
Expand Down Expand Up @@ -312,7 +315,8 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
getAnalysis<LiveRegMatrix>());
VirtRegAuxInfo VRAI(
*MF, *LIS, *VRM, getAnalysis<MachineLoopInfoWrapperPass>().getLI(),
getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI());
getAnalysis<MachineBlockFrequencyInfoWrapperPass>().getMBFI(),
&getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
VRAI.calculateSpillWeightsAndHints();

SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, VRAI));
Expand Down
328 changes: 163 additions & 165 deletions llvm/lib/TableGen/Record.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion llvm/lib/TableGen/SetTheory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ void SetTheory::evaluate(const Init *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
const auto *DagExpr = dyn_cast<DagInit>(Expr);
if (!DagExpr)
PrintFatalError(Loc, "Invalid set element: " + Expr->getAsString());
const DefInit *OpInit = dyn_cast<DefInit>(DagExpr->getOperator());
const auto *OpInit = dyn_cast<DefInit>(DagExpr->getOperator());
if (!OpInit)
PrintFatalError(Loc, "Bad set expression: " + Expr->getAsString());
auto I = Operators.find(OpInit->getDef()->getName());
Expand Down
150 changes: 75 additions & 75 deletions llvm/lib/TableGen/TGParser.cpp

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5971,6 +5971,29 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1).getValueType(),
Op.getOperand(1), Op.getOperand(2)));
return SDValue();
case Intrinsic::aarch64_neon_sqrshrn:
if (Op.getValueType().isVector())
return DAG.getNode(
ISD::TRUNCATE_SSAT_S, dl, Op.getValueType(),
DAG.getNode(
AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
Op.getOperand(1), Op.getOperand(2)));
return SDValue();
case Intrinsic::aarch64_neon_sqrshrun:
if (Op.getValueType().isVector())
return DAG.getNode(
ISD::TRUNCATE_SSAT_U, dl, Op.getValueType(),
DAG.getNode(
AArch64ISD::SRSHR_I, dl, Op.getOperand(1).getValueType(),
Op.getOperand(1), Op.getOperand(2)));
return SDValue();
case Intrinsic::aarch64_neon_uqrshrn:
if (Op.getValueType().isVector())
return DAG.getNode(
ISD::TRUNCATE_USAT_U, dl, Op.getValueType(),
DAG.getNode(
AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(), Op.getOperand(1), Op.getOperand(2)));
return SDValue();
case Intrinsic::aarch64_sve_whilelo:
return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
/*IsEqual=*/false);
Expand Down
12 changes: 6 additions & 6 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -8001,15 +8001,15 @@ def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftL64:$imm))),
(SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
int_aarch64_neon_sqrshrn>;
BinOpFrag<(truncssat_s (AArch64srshri node:$LHS, node:$RHS))>>;
defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
int_aarch64_neon_sqrshrun>;
BinOpFrag<(truncssat_u (AArch64srshri node:$LHS, node:$RHS))>>;
defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
BinOpFrag<(truncssat_s (AArch64vashr node:$LHS, node:$RHS))>>;
BinOpFrag<(truncssat_s (AArch64vashr node:$LHS, node:$RHS))>>;
defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
BinOpFrag<(truncssat_u (AArch64vashr node:$LHS, node:$RHS))>>;
BinOpFrag<(truncssat_u (AArch64vashr node:$LHS, node:$RHS))>>;
defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>;
def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftR64:$imm))),
Expand All @@ -8027,10 +8027,10 @@ defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
int_aarch64_neon_vcvtfxu2fp>;
defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
int_aarch64_neon_uqrshrn>;
BinOpFrag<(truncusat_u (AArch64urshri node:$LHS, node:$RHS))>>;
defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
BinOpFrag<(truncusat_u (AArch64vlshr node:$LHS, node:$RHS))>>;
BinOpFrag<(truncusat_u (AArch64vlshr node:$LHS, node:$RHS))>>;
defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
TriOpFrag<(add node:$LHS,
Expand Down
523 changes: 341 additions & 182 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Large diffs are not rendered by default.

168 changes: 168 additions & 0 deletions llvm/test/CodeGen/AArch64/regalloc-spill-weight-basic.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py

; RUN: llc < %s -mtriple=aarch64 -regalloc=basic | FileCheck %s

; Test that the register allocator behaves differently with minsize functions.

declare void @foo(i32, ptr)

define void @optsize(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %arg5, i1 %arg6) minsize {
; CHECK-LABEL: optsize:
; CHECK: // %bb.0: // %bb
; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
; CHECK-NEXT: .cfi_offset w23, -40
; CHECK-NEXT: .cfi_offset w30, -48
; CHECK-NEXT: mov w23, w5
; CHECK-NEXT: mov x22, x4
; CHECK-NEXT: mov x21, x3
; CHECK-NEXT: mov x20, x2
; CHECK-NEXT: mov w19, w1
; CHECK-NEXT: .LBB0_1: // %bb8
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cbz w19, .LBB0_1
; CHECK-NEXT: // %bb.2: // %bb8
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: cmp w19, #39
; CHECK-NEXT: b.eq .LBB0_6
; CHECK-NEXT: // %bb.3: // %bb8
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: cmp w19, #34
; CHECK-NEXT: b.eq .LBB0_6
; CHECK-NEXT: // %bb.4: // %bb8
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: cmp w19, #10
; CHECK-NEXT: b.ne .LBB0_1
; CHECK-NEXT: // %bb.5: // %bb9
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: str wzr, [x20]
; CHECK-NEXT: b .LBB0_1
; CHECK-NEXT: .LBB0_6: // %bb10
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: mov w0, w23
; CHECK-NEXT: mov x1, x21
; CHECK-NEXT: str wzr, [x22]
; CHECK-NEXT: bl foo
; CHECK-NEXT: b .LBB0_1
bb:
br label %bb7

bb7: ; preds = %bb13, %bb
%phi = phi i32 [ 0, %bb ], [ %spec.select, %bb13 ]
br label %bb8

bb8: ; preds = %bb10, %bb9, %bb8, %bb7
switch i32 %arg1, label %bb8 [
i32 10, label %bb9
i32 1, label %bb16
i32 0, label %bb13
i32 39, label %bb10
i32 34, label %bb10
]

bb9: ; preds = %bb8
store i32 0, ptr %arg2, align 4
br label %bb8

bb10: ; preds = %bb8, %bb8
store i32 0, ptr %arg4, align 4
tail call void @foo(i32 %arg5, ptr %arg3)
br label %bb8

bb13: ; preds = %bb8
%not.arg6 = xor i1 %arg6, true
%spec.select = zext i1 %not.arg6 to i32
br label %bb7

bb16: ; preds = %bb8
unreachable
}

define void @optspeed(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %arg5, i1 %arg6) {
; CHECK-LABEL: optspeed:
; CHECK: // %bb.0: // %bb
; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
; CHECK-NEXT: .cfi_offset w23, -40
; CHECK-NEXT: .cfi_offset w30, -48
; CHECK-NEXT: mov w22, w5
; CHECK-NEXT: mov x21, x4
; CHECK-NEXT: mov x20, x3
; CHECK-NEXT: mov x23, x2
; CHECK-NEXT: mov w19, w1
; CHECK-NEXT: b .LBB1_2
; CHECK-NEXT: .LBB1_1: // %bb10
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: mov w0, w22
; CHECK-NEXT: mov x1, x20
; CHECK-NEXT: str wzr, [x21]
; CHECK-NEXT: bl foo
; CHECK-NEXT: .LBB1_2: // %bb8
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmp w19, #33
; CHECK-NEXT: b.gt .LBB1_6
; CHECK-NEXT: // %bb.3: // %bb8
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: cbz w19, .LBB1_2
; CHECK-NEXT: // %bb.4: // %bb8
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: cmp w19, #10
; CHECK-NEXT: b.ne .LBB1_2
; CHECK-NEXT: // %bb.5: // %bb9
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: str wzr, [x23]
; CHECK-NEXT: b .LBB1_2
; CHECK-NEXT: .LBB1_6: // %bb8
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: cmp w19, #34
; CHECK-NEXT: b.eq .LBB1_1
; CHECK-NEXT: // %bb.7: // %bb8
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: cmp w19, #39
; CHECK-NEXT: b.eq .LBB1_1
; CHECK-NEXT: b .LBB1_2
bb:
br label %bb7

bb7: ; preds = %bb13, %bb
%phi = phi i32 [ 0, %bb ], [ %spec.select, %bb13 ]
br label %bb8

bb8: ; preds = %bb10, %bb9, %bb8, %bb7
switch i32 %arg1, label %bb8 [
i32 10, label %bb9
i32 1, label %bb16
i32 0, label %bb13
i32 39, label %bb10
i32 34, label %bb10
]

bb9: ; preds = %bb8
store i32 0, ptr %arg2, align 4
br label %bb8

bb10: ; preds = %bb8, %bb8
store i32 0, ptr %arg4, align 4
tail call void @foo(i32 %arg5, ptr %arg3)
br label %bb8

bb13: ; preds = %bb8
%not.arg6 = xor i1 %arg6, true
%spec.select = zext i1 %not.arg6 to i32
br label %bb7

bb16: ; preds = %bb8
unreachable
}
371 changes: 371 additions & 0 deletions llvm/test/CodeGen/AArch64/rqshrn.ll

Large diffs are not rendered by default.

11 changes: 5 additions & 6 deletions llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,17 @@

define void @s116_modified(ptr %a) {
; CHECK-LABEL: @s116_modified(
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2
; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3
; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2)
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 0, i32 1, i32 2>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 5, i32 6>
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]]
; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A]], align 4
; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4
; CHECK-NEXT: ret void
;
%gep1 = getelementptr inbounds float, ptr %a, i64 1
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) {
; NON-POWER-OF-2-NEXT: entry:
; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> <i32 3, i32 4, i32 2>
; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0)
; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4
; NON-POWER-OF-2-NEXT: ret void
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,21 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3
; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0)
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i32 3
; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP11]], i64 0)
; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]]
; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[RC21]], align 4
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4
; CHECK-NEXT: br label [[IF_END:%.*]]
; CHECK: entry.if.end72_crit_edge:
; CHECK-NEXT: br label [[IF_END72:%.*]]
Expand All @@ -46,8 +48,7 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float>
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4
; CHECK-NEXT: store <4 x float> [[TMP26]], ptr [[RC21]], align 4
; CHECK-NEXT: ret void
;
entry:
Expand Down
32 changes: 8 additions & 24 deletions llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
Original file line number Diff line number Diff line change
Expand Up @@ -318,22 +318,14 @@ entry:
define float @f(ptr nocapture readonly %x) {
; CHECK-LABEL: @f(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
; CHECK-NEXT: ret float [[OP_RDX]]
;
; THRESHOLD-LABEL: @f(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
; THRESHOLD-NEXT: ret float [[OP_RDX]]
;
entry:
Expand Down Expand Up @@ -606,18 +598,14 @@ define float @loadadd31(ptr nocapture readonly %x) {
; CHECK-LABEL: @loadadd31(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
Expand All @@ -627,18 +615,14 @@ define float @loadadd31(ptr nocapture readonly %x) {
; THRESHOLD-LABEL: @loadadd31(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1013,11 +1013,11 @@ define i32 @maxi8_wrong_parent(i32) {
; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16
; THRESH-NEXT: br label [[PP:%.*]]
; THRESH: pp:
; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4)
; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0)
; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2)
; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0)
; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP4]], i64 4)
; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP2]], i64 6)
; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]])
; THRESH-NEXT: ret i32 [[TMP8]]
;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,19 @@ define void @e(ptr %c, i64 %0) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C]], align 8
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 112
; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX1]], align 8
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 104
; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[C]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[TMP18:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX5]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP18]], <2 x ptr> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x ptr> poison, ptr [[TMP2]], i32 2
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <6 x ptr> [[TMP7]], ptr [[TMP1]], i32 3
; CHECK-NEXT: [[TMP9:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP8]], <2 x ptr> [[TMP4]], i64 0)
; CHECK-NEXT: [[TMP10:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP9]], <2 x ptr> [[TMP6]], i64 4)
; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint <6 x ptr> [[TMP10]] to <6 x i64>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5>
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i64> poison, i64 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i64> [[TMP13]], <32 x i64> poison, <32 x i32> zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i64> [[TMP14]], [[TMP12]]
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) {
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> <i32 3, i32 4, i32 2>
; NON-POW2-NEXT: [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0)
; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4
; NON-POW2-NEXT: ret void
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@ define void @test(ptr %a, ptr %b) {
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4
; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 2
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2
; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0)
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP9]], [[TMP10]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP9]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer
; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[RESULT]], align 4
; CHECK-NEXT: br label [[FOR_BODY]]
Expand Down