diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index b229659415d55..62ded89a7d5de 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -527,7 +527,10 @@ class LLVM_ABI TargetLoweringBase {
   /// and conditional branches. With multiple condition registers, the code
   /// generator will not aggressively sink comparisons into the blocks of their
   /// users.
-  virtual bool hasMultipleConditionRegisters(EVT VT) const { return false; }
+  virtual bool hasMultipleConditionRegisters(EVT ResVT,
+                                             std::optional<EVT> CmpVT) const {
+    return false;
+  }
 
   /// Return true if the target has BitExtract instructions.
   bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2493,7 +2496,7 @@ class LLVM_ABI TargetLoweringBase {
                                        EVT VT) const {
     // If a target has multiple condition registers, then it likely has logical
     // operations on those registers.
-    if (hasMultipleConditionRegisters(VT))
+    if (hasMultipleConditionRegisters(VT, std::nullopt))
       return false;
     // Only do the transform if the value won't be split into multiple
     // registers.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 0309e225d9df4..3f2c6bf18105b 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1840,7 +1840,11 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) {
 ///
 /// Return true if any changes are made.
 static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
-  if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType())))
+  std::optional<EVT> CmpVT;
+  if (Cmp->getOperand(0)->getType()->isIntegerTy())
+    CmpVT = EVT::getEVT(Cmp->getOperand(0)->getType());
+
+  if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType()), CmpVT))
     return false;
 
   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 70bfae717fb76..9111c8320d698 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -915,8 +915,9 @@ class AArch64TargetLowering : public TargetLowering {
     return VecOp.getOpcode() == ISD::SETCC;
   }
 
-  bool hasMultipleConditionRegisters(EVT VT) const override {
-    return VT.isScalableVector();
+  bool hasMultipleConditionRegisters(EVT ResVT,
+                                     std::optional<EVT> CmpVT) const override {
+    return ResVT.isScalableVector();
   }
 };
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index bdaf48652d107..d49e38fae8147 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -395,7 +395,8 @@ class AMDGPUTargetLowering : public TargetLowering {
     return MVT::i32;
   }
 
-  bool hasMultipleConditionRegisters(EVT VT) const override {
+  bool hasMultipleConditionRegisters(EVT ResVT,
+                                     std::optional<EVT> CmpVT) const override {
    // FIXME: This is only partially true. If we have to do vector compares, any
    // SGPR pair can be a condition register. If we have a uniform condition, we
    // are better off doing SALU operations, where there is only one SCC.
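
Reading of the new contract (annotation, not part of the patch): the hook now
receives both the comparison result type and, when the compared operands are
scalar integers, their type; every other call site passes std::nullopt and
keeps the old behaviour. A minimal self-contained sketch of that wiring, where
VT, TLIBase, and maySinkCmp are illustrative stand-ins, not LLVM names:

  // Toy model of the new hook wiring; `VT` stands in for llvm::EVT and
  // `TLIBase` for TargetLoweringBase. Sketch only, not LLVM code.
  #include <optional>

  struct VT {
    unsigned Bits = 0;
    bool ScalarInt = false;
  };

  struct TLIBase {
    // Default, as in TargetLoweringBase: no multiple condition registers.
    virtual bool hasMultipleConditionRegisters(VT ResVT,
                                               std::optional<VT> CmpVT) const {
      return false;
    }
    virtual ~TLIBase() = default;
  };

  // Caller shaped like the sinkCmpExpression change: only integer compare
  // operands populate CmpVT; FP/vector operands keep passing std::nullopt.
  bool maySinkCmp(const TLIBase &TLI, VT ResVT, bool OpIsInt, unsigned OpBits) {
    std::optional<VT> CmpVT;
    if (OpIsInt)
      CmpVT = VT{OpBits, true};
    return !TLI.hasMultipleConditionRegisters(ResVT, CmpVT);
  }
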
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 20fc849ea4aa5..9f7ce41e399d5 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -20116,6 +20116,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
       Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
 }
 
-bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
+bool PPCTargetLowering::hasMultipleConditionRegisters(
+    EVT ResVT, std::optional<EVT> CmpVT) const {
   return Subtarget.useCRBits();
 }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 880aca751d7d6..68ce19ac75b31 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1194,7 +1194,8 @@ namespace llvm {
                             bool IsVarArg) const;
     bool supportsTailCallFor(const CallBase *CB) const;
 
-    bool hasMultipleConditionRegisters(EVT VT) const override;
+    bool hasMultipleConditionRegisters(EVT ResVT,
+                                       std::optional<EVT> CmpVT) const override;
 
   private:
     struct ReuseLoadInfo {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4d44227b3ecd4..d7d3fab496364 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3403,6 +3403,14 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
   return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
 }
 
+bool X86TargetLowering::hasMultipleConditionRegisters(
+    EVT ResVT, std::optional<EVT> CmpVT) const {
+  if (CmpVT.has_value())
+    return CmpVT->isScalarInteger() &&
+           CmpVT->getSizeInBits() > (Subtarget.is64Bit() ? 64 : 32);
+  return TargetLowering::hasMultipleConditionRegisters(ResVT, CmpVT);
+}
+
 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
                                              bool) const {
   // TODO: Allow vectors?
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..612ca589ee139 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1542,6 +1542,11 @@ namespace llvm {
     /// supported.
     bool shouldScalarizeBinop(SDValue) const override;
 
+    /// If this returns true, the code generator will not aggressively sink
+    /// comparisons into the blocks of their users.
+    bool hasMultipleConditionRegisters(EVT ResVT,
+                                       std::optional<EVT> CmpVT) const override;
+
     /// Extract of a scalar FP value from index 0 of a vector is free.
     bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
       EVT EltVT = VT.getScalarType();
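
The apparent rationale for the X86 override above: a scalar integer compare
wider than one general-purpose register (i128 on 64-bit, i64 on 32-bit) must
be assembled from a register pair with extra XOR/OR instructions, so
duplicating it into every user block is a loss; reporting multiple condition
registers makes CodeGenPrepare leave the compare in place. A compilable sketch
of just that predicate, with plain structs in place of EVT and Subtarget
(CmpTy and x86HasMultipleCondRegs are illustrative names):

  // Sketch of the X86 policy; assumptions: CmpTy stands in for llvm::EVT,
  // the bool flag for X86Subtarget::is64Bit().
  #include <cassert>
  #include <optional>

  struct CmpTy {
    unsigned Bits;
    bool ScalarInt;
  };

  static bool x86HasMultipleCondRegs(std::optional<CmpTy> CmpVT, bool Is64Bit) {
    if (CmpVT.has_value())
      return CmpVT->ScalarInt && CmpVT->Bits > (Is64Bit ? 64u : 32u);
    return false; // falls back to the TargetLowering default
  }

  int main() {
    // i128 compare on x86-64: wider than one GPR, so do not sink/duplicate.
    assert(x86HasMultipleCondRegs(CmpTy{128, true}, /*Is64Bit=*/true));
    // i64 fits a single GPR on x86-64: old sinking behaviour is kept.
    assert(!x86HasMultipleCondRegs(CmpTy{64, true}, /*Is64Bit=*/true));
    // No operand info (e.g. the logic-op query passing std::nullopt).
    assert(!x86HasMultipleCondRegs(std::nullopt, /*Is64Bit=*/true));
    return 0;
  }
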
diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
index 1962ddebc2115..5a5feaa7734e9 100644
--- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
+++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
@@ -34,16 +34,16 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    .cfi_offset %edi, -16
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:  Ltmp0:
-; CHECK-NEXT:    ## implicit-def: $ebx
+; CHECK-NEXT:  Ltmp0: ## EH_LABEL
+; CHECK-NEXT:    ## implicit-def: $edi
 ; CHECK-NEXT:    calll __Znam
-; CHECK-NEXT:  Ltmp1:
+; CHECK-NEXT:  Ltmp1: ## EH_LABEL
 ; CHECK-NEXT:  ## %bb.1: ## %bb11
 ; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    movb $1, %bl
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_2
 ; CHECK-NEXT:  ## %bb.7: ## %bb31
 ; CHECK-NEXT:    ## implicit-def: $eax
@@ -53,23 +53,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    ## Child Loop BB0_13 Depth 2
 ; CHECK-NEXT:    ## Child Loop BB0_16 Depth 3
 ; CHECK-NEXT:    ## Child Loop BB0_21 Depth 2
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_9
 ; CHECK-NEXT:  ## %bb.10: ## %bb41
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
-; CHECK-NEXT:  Ltmp2:
+; CHECK-NEXT:  Ltmp2: ## EH_LABEL
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %esi, (%esp)
 ; CHECK-NEXT:    calll _Pjii
-; CHECK-NEXT:  Ltmp3:
+; CHECK-NEXT:  Ltmp3: ## EH_LABEL
 ; CHECK-NEXT:  ## %bb.11: ## %bb42
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    decl %eax
-; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_18
 ; CHECK-NEXT:  ## %bb.12: ## %bb45.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
@@ -78,8 +75,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    ## Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    ## => This Loop Header: Depth=2
 ; CHECK-NEXT:    ## Child Loop BB0_16 Depth 3
-; CHECK-NEXT:    movb $1, %cl
-; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_19
 ; CHECK-NEXT:  ## %bb.14: ## %bb48
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=2
@@ -88,14 +84,14 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=2
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    movl %esi, %edx
-; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; CHECK-NEXT:  LBB0_16: ## %bb49
 ; CHECK-NEXT:    ## Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    ## Parent Loop BB0_13 Depth=2
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    addl $4, %edx
-; CHECK-NEXT:    decl %ebx
+; CHECK-NEXT:    decl %edi
 ; CHECK-NEXT:    jne LBB0_16
 ; CHECK-NEXT:  LBB0_17: ## %bb57
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=2
@@ -107,13 +103,12 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    calll ___bzero
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_22
 ; CHECK-NEXT:  ## %bb.20: ## %bb61.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
 ; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; CHECK-NEXT:  LBB0_21: ## %bb61
 ; CHECK-NEXT:    ## Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
@@ -126,24 +121,24 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    jmp LBB0_8
 ; CHECK-NEXT:  LBB0_18: ## %bb43
-; CHECK-NEXT:  Ltmp5:
-; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:  Ltmp5: ## EH_LABEL
+; CHECK-NEXT:    movl %esi, %edi
 ; CHECK-NEXT:    calll _OnOverFlow
-; CHECK-NEXT:  Ltmp6:
+; CHECK-NEXT:  Ltmp6: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_3
 ; CHECK-NEXT:  LBB0_2: ## %bb29
-; CHECK-NEXT:  Ltmp7:
-; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:  Ltmp7: ## EH_LABEL
+; CHECK-NEXT:    movl %esi, %edi
 ; CHECK-NEXT:    calll _OnOverFlow
-; CHECK-NEXT:  Ltmp8:
+; CHECK-NEXT:  Ltmp8: ## EH_LABEL
 ; CHECK-NEXT:  LBB0_3: ## %bb30
 ; CHECK-NEXT:    ud2
 ; CHECK-NEXT:  LBB0_4: ## %bb20.loopexit
-; CHECK-NEXT:  Ltmp4:
+; CHECK-NEXT:  Ltmp4: ## EH_LABEL
 ; CHECK-NEXT:  LBB0_9:
-; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %esi, %edi
 ; CHECK-NEXT:  LBB0_6: ## %bb23
-; CHECK-NEXT:    testl %ebx, %ebx
+; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    addl $28, %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
@@ -151,7 +146,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
 ; CHECK-NEXT:  LBB0_5: ## %bb20.loopexit.split-lp
-; CHECK-NEXT:  Ltmp9:
+; CHECK-NEXT:  Ltmp9: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_6
 ; CHECK-NEXT:  Lfunc_end0:
 bb:
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
index aef44cc3e40d0..a311acb4c0409 100644
--- a/llvm/test/CodeGen/X86/pr166534.ll
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -1,108 +1,57 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
 
 define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ; SSE2-LABEL: pr166534:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movq (%rdi), %rax
-; SSE2-NEXT:    movq 8(%rdi), %r8
 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
-; SSE2-NEXT:    movq (%rsi), %r9
-; SSE2-NEXT:    movq 8(%rsi), %rdi
 ; SSE2-NEXT:    movdqu (%rsi), %xmm1
 ; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSE2-NEXT:    pmovmskb %xmm1, %esi
-; SSE2-NEXT:    xorl %r10d, %r10d
+; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
-; SSE2-NEXT:    sete %r10b
-; SSE2-NEXT:    orq %r10, (%rdx)
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    orq %rax, (%rdx)
 ; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
 ; SSE2-NEXT:    jne .LBB0_2
 ; SSE2-NEXT:  # %bb.1: # %if.then
-; SSE2-NEXT:    xorq %r9, %rax
-; SSE2-NEXT:    xorq %rdi, %r8
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    orq %rax, %r8
-; SSE2-NEXT:    sete %dl
-; SSE2-NEXT:    orq %rdx, (%rcx)
+; SSE2-NEXT:    orq %rax, (%rcx)
 ; SSE2-NEXT:  .LBB0_2: # %if.end
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: pr166534:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movq (%rdi), %rax
-; SSE4-NEXT:    movq 8(%rdi), %r8
 ; SSE4-NEXT:    movdqu (%rdi), %xmm0
-; SSE4-NEXT:    movq (%rsi), %r9
-; SSE4-NEXT:    movq 8(%rsi), %rdi
 ; SSE4-NEXT:    movdqu (%rsi), %xmm1
 ; SSE4-NEXT:    pxor %xmm0, %xmm1
-; SSE4-NEXT:    xorl %esi, %esi
+; SSE4-NEXT:    xorl %eax, %eax
 ; SSE4-NEXT:    ptest %xmm1, %xmm1
-; SSE4-NEXT:    sete %sil
-; SSE4-NEXT:    orq %rsi, (%rdx)
+; SSE4-NEXT:    sete %al
+; SSE4-NEXT:    orq %rax, (%rdx)
 ; SSE4-NEXT:    ptest %xmm1, %xmm1
 ; SSE4-NEXT:    jne .LBB0_2
 ; SSE4-NEXT:  # %bb.1: # %if.then
-; SSE4-NEXT:    xorq %r9, %rax
-; SSE4-NEXT:    xorq %rdi, %r8
-; SSE4-NEXT:    xorl %edx, %edx
-; SSE4-NEXT:    orq %rax, %r8
-; SSE4-NEXT:    sete %dl
-; SSE4-NEXT:    orq %rdx, (%rcx)
+; SSE4-NEXT:    orq %rax, (%rcx)
 ; SSE4-NEXT:  .LBB0_2: # %if.end
 ; SSE4-NEXT:    retq
 ;
-; AVX2-LABEL: pr166534:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movq (%rdi), %rax
-; AVX2-NEXT:    movq 8(%rdi), %r8
-; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX2-NEXT:    movq (%rsi), %rdi
-; AVX2-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    movq 8(%rsi), %rsi
-; AVX2-NEXT:    xorl %r9d, %r9d
-; AVX2-NEXT:    vptest %xmm0, %xmm0
-; AVX2-NEXT:    sete %r9b
-; AVX2-NEXT:    orq %r9, (%rdx)
-; AVX2-NEXT:    vptest %xmm0, %xmm0
-; AVX2-NEXT:    jne .LBB0_2
-; AVX2-NEXT:  # %bb.1: # %if.then
-; AVX2-NEXT:    xorq %rdi, %rax
-; AVX2-NEXT:    xorq %rsi, %r8
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    orq %rax, %r8
-; AVX2-NEXT:    sete %dl
-; AVX2-NEXT:    orq %rdx, (%rcx)
-; AVX2-NEXT:  .LBB0_2: # %if.end
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: pr166534:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    movq (%rdi), %rax
-; AVX512-NEXT:    movq 8(%rdi), %r8
-; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512-NEXT:    movq (%rsi), %r9
-; AVX512-NEXT:    movq 8(%rsi), %rdi
-; AVX512-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; AVX512-NEXT:    xorl %esi, %esi
-; AVX512-NEXT:    vptest %xmm0, %xmm0
-; AVX512-NEXT:    sete %sil
-; AVX512-NEXT:    orq %rsi, (%rdx)
-; AVX512-NEXT:    vptest %xmm0, %xmm0
-; AVX512-NEXT:    jne .LBB0_2
-; AVX512-NEXT:  # %bb.1: # %if.then
-; AVX512-NEXT:    xorq %r9, %rax
-; AVX512-NEXT:    xorq %rdi, %r8
-; AVX512-NEXT:    xorl %edx, %edx
-; AVX512-NEXT:    orq %rax, %r8
-; AVX512-NEXT:    sete %dl
-; AVX512-NEXT:    orq %rdx, (%rcx)
-; AVX512-NEXT:  .LBB0_2: # %if.end
-; AVX512-NEXT:    retq
+; AVX-LABEL: pr166534:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    vptest %xmm0, %xmm0
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    orq %rax, (%rdx)
+; AVX-NEXT:    vptest %xmm0, %xmm0
+; AVX-NEXT:    jne .LBB0_2
+; AVX-NEXT:  # %bb.1: # %if.then
+; AVX-NEXT:    orq %rax, (%rcx)
+; AVX-NEXT:  .LBB0_2: # %if.end
+; AVX-NEXT:    retq
 entry:
   %a = load i128, ptr %pa, align 8
   %b = load i128, ptr %pb, align 8