From bd10f494d511eaf4aa689051cbf458435a0eb652 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 5 Nov 2025 14:01:33 +0000
Subject: [PATCH] [CodeGenPrepare][X86] hasMultipleConditionRegisters - don't
 sink larger-than-legal integer comparisons

X86 expands integer comparisons wider than the largest legal type into a
sequence split across multiple registers - we're better off storing the
boolean result once than recomputing the expanded comparison in every user
block.

I've tweaked hasMultipleConditionRegisters to (optionally) take the CmpInst
operand type as well as the result type.

I don't like the way that X86 and other targets are overriding
hasMultipleConditionRegisters just to avoid comparison sinking, but I'm not
sure whether there's a better approach to take - especially as
shouldNormalizeToSelectSequence uses it as well. Comments welcome!

Fixes #166534
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  7 +-
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |  6 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  5 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |  3 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  3 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |  3 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  8 ++
 llvm/lib/Target/X86/X86ISelLowering.h         |  5 +
 .../X86/2012-01-10-UndefExceptionEdge.ll      | 55 +++++------
 llvm/test/CodeGen/X86/pr166534.ll             | 99 +++++--------------
 10 files changed, 81 insertions(+), 113 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 78f63b4406eb0..f5a062fa1b8bd 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -527,7 +527,10 @@ class LLVM_ABI TargetLoweringBase {
   /// and conditional branches. With multiple condition registers, the code
   /// generator will not aggressively sink comparisons into the blocks of their
   /// users.
-  virtual bool hasMultipleConditionRegisters(EVT VT) const { return false; }
+  virtual bool hasMultipleConditionRegisters(EVT ResVT,
+                                             std::optional<EVT> CmpVT) const {
+    return false;
+  }
 
   /// Return true if the target has BitExtract instructions.
   bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2493,7 +2496,7 @@ class LLVM_ABI TargetLoweringBase {
                                                EVT VT) const {
     // If a target has multiple condition registers, then it likely has logical
     // operations on those registers.
-    if (hasMultipleConditionRegisters(VT))
+    if (hasMultipleConditionRegisters(VT, std::nullopt))
       return false;
     // Only do the transform if the value won't be split into multiple
     // registers.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 0309e225d9df4..3f2c6bf18105b 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1840,7 +1840,11 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) {
 ///
 /// Return true if any changes are made.
 static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
-  if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType())))
+  std::optional<EVT> CmpVT;
+  if (Cmp->getOperand(0)->getType()->isIntegerTy())
+    CmpVT = EVT::getEVT(Cmp->getOperand(0)->getType());
+
+  if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType()), CmpVT))
     return false;
 
   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
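
(Illustration only, not part of the patch: a hand-written IR sketch in the
spirit of pr166534, with invented names, of the pattern the CodeGenPrepare
change is about. %cmp has a user in a block other than its own, so
sinkCmpExpression would previously clone the i128 compare into %use, where
x86-64 re-expands each copy into the split xorq/xorq/orq/sete sequence seen
in the old test checks below; with this change the i1 is computed once and
kept live instead.)

  define void @sink_sketch(i128 %a, i128 %b, ptr %p) {
  entry:
    %cmp = icmp eq i128 %a, %b
    br i1 %cmp, label %use, label %exit

  use:                                   ; second user of %cmp lives here
    %ext = zext i1 %cmp to i64
    store i64 %ext, ptr %p, align 8
    br label %exit

  exit:
    ret void
  }
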
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 70bfae717fb76..9111c8320d698 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -915,8 +915,9 @@ class AArch64TargetLowering : public TargetLowering {
     return VecOp.getOpcode() == ISD::SETCC;
   }
 
-  bool hasMultipleConditionRegisters(EVT VT) const override {
-    return VT.isScalableVector();
+  bool hasMultipleConditionRegisters(EVT ResVT,
+                                     std::optional<EVT> CmpVT) const override {
+    return ResVT.isScalableVector();
   }
 };
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index bdaf48652d107..d49e38fae8147 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -395,7 +395,8 @@ class AMDGPUTargetLowering : public TargetLowering {
     return MVT::i32;
   }
 
-  bool hasMultipleConditionRegisters(EVT VT) const override {
+  bool hasMultipleConditionRegisters(EVT ResVT,
+                                     std::optional<EVT> CmpVT) const override {
     // FIXME: This is only partially true. If we have to do vector compares, any
     // SGPR pair can be a condition register. If we have a uniform condition, we
     // are better off doing SALU operations, where there is only one SCC. For
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 20fc849ea4aa5..9f7ce41e399d5 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -20116,6 +20116,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
       Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
 }
 
-bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
+bool PPCTargetLowering::hasMultipleConditionRegisters(
+    EVT ResVT, std::optional<EVT> CmpVT) const {
   return Subtarget.useCRBits();
 }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 880aca751d7d6..68ce19ac75b31 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1194,7 +1194,8 @@ namespace llvm {
                              bool IsVarArg) const;
     bool supportsTailCallFor(const CallBase *CB) const;
 
-    bool hasMultipleConditionRegisters(EVT VT) const override;
+    bool hasMultipleConditionRegisters(EVT ResVT,
+                                       std::optional<EVT> CmpVT) const override;
 
   private:
     struct ReuseLoadInfo {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4d44227b3ecd4..d7d3fab496364 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3403,6 +3403,14 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
   return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
 }
 
+bool X86TargetLowering::hasMultipleConditionRegisters(
+    EVT ResVT, std::optional<EVT> CmpVT) const {
+  if (CmpVT.has_value())
+    return CmpVT->isScalarInteger() &&
+           CmpVT->getSizeInBits() > (Subtarget.is64Bit() ? 64 : 32);
+  return TargetLowering::hasMultipleConditionRegisters(ResVT, CmpVT);
+}
+
 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
                                              bool) const {
   // TODO: Allow vectors?
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..612ca589ee139 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1542,6 +1542,11 @@ namespace llvm {
     /// supported.
     bool shouldScalarizeBinop(SDValue) const override;
 
+    /// If this returns true, the code generator will not aggressively sink
+    /// comparisons into the blocks of their users.
+    bool hasMultipleConditionRegisters(EVT ResVT,
+                                       std::optional<EVT> CmpVT) const override;
+
     /// Extract of a scalar FP value from index 0 of a vector is free.
     bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
       EVT EltVT = VT.getScalarType();
diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
index 1962ddebc2115..5a5feaa7734e9 100644
--- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
+++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
@@ -34,16 +34,16 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    .cfi_offset %edi, -16
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:  Ltmp0:
-; CHECK-NEXT:    ## implicit-def: $ebx
+; CHECK-NEXT:  Ltmp0: ## EH_LABEL
+; CHECK-NEXT:    ## implicit-def: $edi
 ; CHECK-NEXT:    calll __Znam
-; CHECK-NEXT:  Ltmp1:
+; CHECK-NEXT:  Ltmp1: ## EH_LABEL
 ; CHECK-NEXT:  ## %bb.1: ## %bb11
 ; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    movb $1, %bl
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_2
 ; CHECK-NEXT:  ## %bb.7: ## %bb31
 ; CHECK-NEXT:    ## implicit-def: $eax
@@ -53,23 +53,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    ## Child Loop BB0_13 Depth 2
 ; CHECK-NEXT:    ## Child Loop BB0_16 Depth 3
 ; CHECK-NEXT:    ## Child Loop BB0_21 Depth 2
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_9
 ; CHECK-NEXT:  ## %bb.10: ## %bb41
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
-; CHECK-NEXT:  Ltmp2:
+; CHECK-NEXT:  Ltmp2: ## EH_LABEL
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %esi, (%esp)
 ; CHECK-NEXT:    calll _Pjii
-; CHECK-NEXT:  Ltmp3:
+; CHECK-NEXT:  Ltmp3: ## EH_LABEL
 ; CHECK-NEXT:  ## %bb.11: ## %bb42
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    decl %eax
-; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_18
 ; CHECK-NEXT:  ## %bb.12: ## %bb45.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
@@ -78,8 +75,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    ## Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    ## => This Loop Header: Depth=2
 ; CHECK-NEXT:    ## Child Loop BB0_16 Depth 3
-; CHECK-NEXT:    movb $1, %cl
-; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_19
 ; CHECK-NEXT:  ## %bb.14: ## %bb48
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=2
@@ -88,14 +84,14 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=2
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    movl %esi, %edx
-; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; CHECK-NEXT:  LBB0_16: ## %bb49
 ; CHECK-NEXT:    ## Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    ## Parent Loop BB0_13 Depth=2
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    addl $4, %edx
-; CHECK-NEXT:    decl %ebx
+; CHECK-NEXT:    decl %edi
 ; CHECK-NEXT:    jne LBB0_16
 ; CHECK-NEXT:  LBB0_17: ## %bb57
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=2
@@ -107,13 +103,12 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    calll ___bzero
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_22
 ; CHECK-NEXT:  ## %bb.20: ## %bb61.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
 ; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; CHECK-NEXT:  LBB0_21: ## %bb61
 ; CHECK-NEXT:    ## Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
@@ -126,24 +121,24 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    jmp LBB0_8
 ; CHECK-NEXT:  LBB0_18: ## %bb43
-; CHECK-NEXT:  Ltmp5:
-; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:  Ltmp5: ## EH_LABEL
+; CHECK-NEXT:    movl %esi, %edi
 ; CHECK-NEXT:    calll _OnOverFlow
-; CHECK-NEXT:  Ltmp6:
+; CHECK-NEXT:  Ltmp6: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_3
 ; CHECK-NEXT:  LBB0_2: ## %bb29
-; CHECK-NEXT:  Ltmp7:
-; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:  Ltmp7: ## EH_LABEL
+; CHECK-NEXT:    movl %esi, %edi
 ; CHECK-NEXT:    calll _OnOverFlow
-; CHECK-NEXT:  Ltmp8:
+; CHECK-NEXT:  Ltmp8: ## EH_LABEL
 ; CHECK-NEXT:  LBB0_3: ## %bb30
 ; CHECK-NEXT:    ud2
 ; CHECK-NEXT:  LBB0_4: ## %bb20.loopexit
-; CHECK-NEXT:  Ltmp4:
+; CHECK-NEXT:  Ltmp4: ## EH_LABEL
 ; CHECK-NEXT:  LBB0_9:
-; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %esi, %edi
 ; CHECK-NEXT:  LBB0_6: ## %bb23
-; CHECK-NEXT:    testl %ebx, %ebx
+; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    addl $28, %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
@@ -151,7 +146,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
 ; CHECK-NEXT:  LBB0_5: ## %bb20.loopexit.split-lp
-; CHECK-NEXT:  Ltmp9:
+; CHECK-NEXT:  Ltmp9: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_6
 ; CHECK-NEXT:  Lfunc_end0:
 bb:
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
index aef44cc3e40d0..a311acb4c0409 100644
--- a/llvm/test/CodeGen/X86/pr166534.ll
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -1,108 +1,57 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
 
 define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ; SSE2-LABEL: pr166534:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movq (%rdi), %rax
-; SSE2-NEXT:    movq 8(%rdi), %r8
 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
-; SSE2-NEXT:    movq (%rsi), %r9
-; SSE2-NEXT:    movq 8(%rsi), %rdi
 ; SSE2-NEXT:    movdqu (%rsi), %xmm1
 ; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSE2-NEXT:    pmovmskb %xmm1, %esi
-; SSE2-NEXT:    xorl %r10d, %r10d
+; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
-; SSE2-NEXT:    sete %r10b
-; SSE2-NEXT:    orq %r10, (%rdx)
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    orq %rax, (%rdx)
 ; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
 ; SSE2-NEXT:    jne .LBB0_2
 ; SSE2-NEXT:  # %bb.1: # %if.then
-; SSE2-NEXT:    xorq %r9, %rax
-; SSE2-NEXT:    xorq %rdi, %r8
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    orq %rax, %r8
-; SSE2-NEXT:    sete %dl
-; SSE2-NEXT:    orq %rdx, (%rcx)
+; SSE2-NEXT:    orq %rax, (%rcx)
 ; SSE2-NEXT:  .LBB0_2: # %if.end
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: pr166534:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movq (%rdi), %rax
-; SSE4-NEXT:    movq 8(%rdi), %r8
 ; SSE4-NEXT:    movdqu (%rdi), %xmm0
-; SSE4-NEXT:    movq (%rsi), %r9
-; SSE4-NEXT:    movq 8(%rsi), %rdi
 ; SSE4-NEXT:    movdqu (%rsi), %xmm1
 ; SSE4-NEXT:    pxor %xmm0, %xmm1
-; SSE4-NEXT:    xorl %esi, %esi
+; SSE4-NEXT:    xorl %eax, %eax
 ; SSE4-NEXT:    ptest %xmm1, %xmm1
-; SSE4-NEXT:    sete %sil
-; SSE4-NEXT:    orq %rsi, (%rdx)
+; SSE4-NEXT:    sete %al
+; SSE4-NEXT:    orq %rax, (%rdx)
 ; SSE4-NEXT:    ptest %xmm1, %xmm1
 ; SSE4-NEXT:    jne .LBB0_2
 ; SSE4-NEXT:  # %bb.1: # %if.then
-; SSE4-NEXT:    xorq %r9, %rax
-; SSE4-NEXT:    xorq %rdi, %r8
-; SSE4-NEXT:    xorl %edx, %edx
-; SSE4-NEXT:    orq %rax, %r8
-; SSE4-NEXT:    sete %dl
-; SSE4-NEXT:    orq %rdx, (%rcx)
+; SSE4-NEXT:    orq %rax, (%rcx)
 ; SSE4-NEXT:  .LBB0_2: # %if.end
 ; SSE4-NEXT:    retq
 ;
-; AVX2-LABEL: pr166534:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movq (%rdi), %rax
-; AVX2-NEXT:    movq 8(%rdi), %r8
-; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX2-NEXT:    movq (%rsi), %rdi
-; AVX2-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    movq 8(%rsi), %rsi
-; AVX2-NEXT:    xorl %r9d, %r9d
-; AVX2-NEXT:    vptest %xmm0, %xmm0
-; AVX2-NEXT:    sete %r9b
-; AVX2-NEXT:    orq %r9, (%rdx)
-; AVX2-NEXT:    vptest %xmm0, %xmm0
-; AVX2-NEXT:    jne .LBB0_2
-; AVX2-NEXT:  # %bb.1: # %if.then
-; AVX2-NEXT:    xorq %rdi, %rax
-; AVX2-NEXT:    xorq %rsi, %r8
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    orq %rax, %r8
-; AVX2-NEXT:    sete %dl
-; AVX2-NEXT:    orq %rdx, (%rcx)
-; AVX2-NEXT:  .LBB0_2: # %if.end
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: pr166534:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    movq (%rdi), %rax
-; AVX512-NEXT:    movq 8(%rdi), %r8
-; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512-NEXT:    movq (%rsi), %r9
-; AVX512-NEXT:    movq 8(%rsi), %rdi
-; AVX512-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; AVX512-NEXT:    xorl %esi, %esi
-; AVX512-NEXT:    vptest %xmm0, %xmm0
-; AVX512-NEXT:    sete %sil
-; AVX512-NEXT:    orq %rsi, (%rdx)
-; AVX512-NEXT:    vptest %xmm0, %xmm0
-; AVX512-NEXT:    jne .LBB0_2
-; AVX512-NEXT:  # %bb.1: # %if.then
-; AVX512-NEXT:    xorq %r9, %rax
-; AVX512-NEXT:    xorq %rdi, %r8
-; AVX512-NEXT:    xorl %edx, %edx
-; AVX512-NEXT:    orq %rax, %r8
-; AVX512-NEXT:    sete %dl
-; AVX512-NEXT:    orq %rdx, (%rcx)
-; AVX512-NEXT:  .LBB0_2: # %if.end
-; AVX512-NEXT:    retq
+; AVX-LABEL: pr166534:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    vptest %xmm0, %xmm0
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    orq %rax, (%rdx)
+; AVX-NEXT:    vptest %xmm0, %xmm0
+; AVX-NEXT:    jne .LBB0_2
+; AVX-NEXT:  # %bb.1: # %if.then
+; AVX-NEXT:    orq %rax, (%rcx)
+; AVX-NEXT:  .LBB0_2: # %if.end
+; AVX-NEXT:    retq
 entry:
   %a = load i128, ptr %pa, align 8
   %b = load i128, ptr %pb, align 8
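
(Usage note, also not part of the patch: my reading of the new X86 override,
not asserted by the tests above, is that the cutoff follows the native GPR
width - Subtarget.is64Bit() ? 64 : 32 - so on a 32-bit target even an i64
equality compare is now kept rather than sunk. A hand-written sketch with
invented names:)

  ; On i686, %c is wider than the 32-bit GPR limit, so after this patch
  ; CodeGenPrepare keeps the single i1 live into %then instead of
  ; duplicating the i64 compare there.
  define void @sketch_i64_on_i686(i64 %a, i64 %b, ptr %p) {
  entry:
    %c = icmp eq i64 %a, %b
    br i1 %c, label %then, label %end

  then:
    %z = zext i1 %c to i32
    store i32 %z, ptr %p, align 4
    br label %end

  end:
    ret void
  }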