-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] Try Folding icmp of v8i32 -> fcmp of v8f32 on AVX #82290
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-x86 Author: None (goldsteinn) Changes
Patch is 282.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/82290.diff 30 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d9657e1aeb8026..b438700e69d8ca 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23299,6 +23299,110 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
}
+ // We get bad codegen for v8i32 compares on avx targets (without avx2) so if
+ // possible convert to a v8f32 compare.
+ if (VTOp0.getVectorElementType() == MVT::i32 && VTOp0 == MVT::v8i32 &&
+ Subtarget.hasAVX() && !Subtarget.hasAVX2()) {
+ std::optional<KnownBits> KnownOps[2];
+ // Check if an op is known to be in a certain range.
+ auto OpInRange = [&DAG, Op, &KnownOps](unsigned OpNo, bool CmpLT,
+ const APInt Bound) {
+ if (!KnownOps[OpNo].has_value())
+ KnownOps[OpNo] = DAG.computeKnownBits(Op.getOperand(OpNo));
+
+ if (KnownOps[OpNo]->isUnknown())
+ return false;
+
+ std::optional<bool> Res;
+ if (CmpLT)
+ Res = KnownBits::ult(*KnownOps[OpNo], KnownBits::makeConstant(Bound));
+ else
+ Res = KnownBits::ugt(*KnownOps[OpNo], KnownBits::makeConstant(Bound));
+ return Res.has_value() && *Res;
+ };
+
+ bool OkayCvt = false;
+ bool OkayBitcast = false;
+
+ // For cvt up to 1 << (Significand Precision)
+ const APInt MaxConvertableCvt = APInt(32, (1U << 24));
+ // For bitcast up to (and including) first inf representation (0x7f800000)
+ const APInt MaxConvertableBitcast = APInt(32, 0x7f800001);
+
+ // For bitcast we need both lhs/op1 u< MaxConvertableBitcast
+ // NB: It might be worth it to enable to bitcast version for unsigned avx2
+ // comparisons as they typically require multiple instructions to lower
+ // (they don't fit `vpcmpeq`/`vpcmpgt` well).
+ if (OpInRange(1, /*CmpLT*/ true, MaxConvertableBitcast) &&
+ OpInRange(0, /*CmpLT*/ true, MaxConvertableBitcast)) {
+ OkayBitcast = true;
+ }
+ // We want to convert icmp -> fcmp using `sitofp` iff one of the converts
+ // will be constant folded.
+ else if ((DAG.isConstantValueOfAnyType(peekThroughBitcasts(Op1)) ||
+ DAG.isConstantValueOfAnyType(peekThroughBitcasts(Op0)))) {
+ if (isUnsignedIntSetCC(Cond)) {
+ // For cvt + unsigned compare we need both lhs/rhs >= 0 and either lhs
+ // or rhs < MaxConvertableCvt
+
+ if (OpInRange(1, /*CmpLT*/ true, APInt::getSignedMinValue(32)) &&
+ OpInRange(0, /*CmpLT*/ true, APInt::getSignedMinValue(32)) &&
+ (OpInRange(1, /*CmpLT*/ true, MaxConvertableCvt) ||
+ OpInRange(0, /*CmpLT*/ true, MaxConvertableCvt)))
+ OkayCvt = true;
+ } else {
+ // For cvt + signed compare we need abs(lhs) or abs(rhs) <
+ // MaxConvertableCvt
+ if (OpInRange(1, /*CmpLT*/ true, MaxConvertableCvt) ||
+ OpInRange(1, /*CmpLT*/ false, -MaxConvertableCvt) ||
+ OpInRange(0, /*CmpLT*/ true, MaxConvertableCvt) ||
+ OpInRange(0, /*CmpLT*/ false, -MaxConvertableCvt))
+ OkayCvt = true;
+ }
+ }
+
+ if (OkayBitcast || OkayCvt) {
+ switch (Cond) {
+ default:
+ llvm_unreachable("Unexpected SETCC condition");
+ // Get the new FP condition. Note for the unsigned conditions we have
+ // verified its okay to convert to the signed version.
+ case ISD::SETULT:
+ case ISD::SETLT:
+ Cond = ISD::SETOLT;
+ break;
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ Cond = ISD::SETOGT;
+ break;
+ case ISD::SETULE:
+ case ISD::SETLE:
+ Cond = ISD::SETOLE;
+ break;
+ case ISD::SETUGE:
+ case ISD::SETGE:
+ Cond = ISD::SETOGE;
+ break;
+ case ISD::SETEQ:
+ Cond = ISD::SETOEQ;
+ break;
+ case ISD::SETNE:
+ Cond = ISD::SETONE;
+ break;
+ }
+
+ MVT FpVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+ if (OkayBitcast) {
+ Op0 = DAG.getBitcast(FpVT, Op0);
+ Op1 = DAG.getBitcast(FpVT, Op1);
+ } else {
+ Op0 = DAG.getNode(ISD::SINT_TO_FP, dl, FpVT, Op0);
+ Op1 = DAG.getNode(ISD::SINT_TO_FP, dl, FpVT, Op1);
+ }
+ return DAG.getSetCC(dl, VT, Op0, Op1, Cond);
+ }
+ }
+
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index 6255621d870e12..d680135be56418 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -258,10 +258,7 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i8_8i32:
@@ -489,16 +486,10 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vcmpeqps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vcmpeqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i16_16i32:
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index ee39b1333fff31..c7785c56972d8c 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -327,10 +327,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; AVX1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
@@ -630,18 +629,16 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vcmpeqps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
+; AVX1-NEXT: vcmpeqps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/cmpf-avx.ll b/llvm/test/CodeGen/X86/cmpf-avx.ll
new file mode 100644
index 00000000000000..f343e53b7fab83
--- /dev/null
+++ b/llvm/test/CodeGen/X86/cmpf-avx.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64
+
+define <8 x i32> @cmp_eq_bitcast(<8 x i32> %x) {
+; X86-LABEL: cmp_eq_bitcast:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_eq_bitcast:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %and = and <8 x i32> %x, <i32 7, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %cmp = icmp eq <8 x i32> %and, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_ne_sitofp(<8 x i32> %x) {
+; X86-LABEL: cmp_ne_sitofp:
+; X86: # %bb.0:
+; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT: vcmpneq_oqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_ne_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: vcmpneq_oqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %cmp = icmp ne <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_slt_fail_no_const(<8 x i32> %x, <8 x i32> %y) {
+; X86-LABEL: cmp_slt_fail_no_const:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; X86-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_slt_fail_no_const:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; X64-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %and = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %cmp = icmp slt <8 x i32> %and, %y
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_eq_sitofp(<8 x i32> %x) {
+; X86-LABEL: cmp_eq_sitofp:
+; X86: # %bb.0:
+; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_eq_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %cmp = icmp eq <8 x i32> %x, <i32 -3, i32 -3, i32 -3, i32 -3, i32 -3, i32 -3, i32 -3, i32 -3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_sgt_fail_no_bounds(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: cmp_sgt_fail_no_bounds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp slt <8 x i32> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_sgt_bitcast(<8 x i32> %xx, <8 x i32> %yy) {
+; CHECK-LABEL: cmp_sgt_bitcast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040]
+; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vandps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %x = and <8 x i32> %xx, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+ %y = and <8 x i32> %yy, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+
+ %cmp = icmp sgt <8 x i32> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_sle_fail_out_of_bounds(<8 x i32> %xx) {
+; X86-LABEL: cmp_sle_fail_out_of_bounds:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [2139095041,2139095041,2139095041,2139095041]
+; X86-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
+; X86-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_sle_fail_out_of_bounds:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [2139095041,2139095041,2139095041,2139095041]
+; X64-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
+; X64-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %x = and <8 x i32> %xx, <i32 2139095041, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+ %cmp = icmp sle <8 x i32> %x, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_eq_fail_out_of_bounds(<8 x i32> %x) {
+; CHECK-LABEL: cmp_eq_fail_out_of_bounds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777216,16777216,16777216,16777216]
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp eq <8 x i32> %x, <i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_eq_fail_out_of_bounds2(<8 x i32> %x) {
+; CHECK-LABEL: cmp_eq_fail_out_of_bounds2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [4278190080,4278190080,4278190080,4278190080]
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp eq <8 x i32> %x, <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_eq_todo(<8 x i32> %x) {
+; X86-LABEL: cmp_eq_todo:
+; X86: # %bb.0:
+; X86-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_eq_todo:
+; X64: # %bb.0:
+; X64-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %cmp = icmp eq <8 x i32> %x, <i32 -16777215, i32 16777215, i32 16777215, i32 -16777215, i32 16777215, i32 -16777215, i32 16777215, i32 -16777215>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_ult_fail_maybe_negative(<8 x i32> %x) {
+; CHECK-LABEL: cmp_ult_fail_maybe_negative:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2]
+; CHECK-NEXT: vpminud %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; CHECK-NEXT: vpminud %xmm2, %xmm0, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp ult <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_ule_bitcast(<8 x i32> %xx) {
+; X86-LABEL: cmp_ule_bitcast:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_ule_bitcast:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %x = and <8 x i32> %xx, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+ %cmp = icmp ule <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_ugt_sitofp(<8 x i32> %xx) {
+; X86-LABEL: cmp_ugt_sitofp:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; X86-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_ugt_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; X64-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %x = and <8 x i32> %xx, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %cmp = icmp ugt <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
diff --git a/llvm/test/CodeGen/X86/combine-testps.ll b/llvm/test/CodeGen/X86/combine-testps.ll
index 43dddbecf51a7d..66165ce2aa53a5 100644
--- a/llvm/test/CodeGen/X86/combine-testps.ll
+++ b/llvm/test/CodeGen/X86/combine-testps.ll
@@ -171,13 +171,24 @@ define i32 @testpsz_128_signbit(<4 x float> %c, <4 x float> %d, i32 %a, i32 %b)
}
define i32 @testpsnzc_256_signbit(<8 x float> %c, <8 x float> %d, i32 %a, i32 %b) {
-; CHECK-LABEL: testpsnzc_256_signbit:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: vtestps %ymm1, %ymm0
-; CHECK-NEXT: cmovnel %esi, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; AVX-LABEL: testpsnzc_256_signbit:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vcmpltps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vtestps %ymm1, %ymm0
+; AVX-NEXT: cmovnel %esi, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: testpsnzc_256_signbit:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: vtestps %ymm1, %ymm0
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
%t0 = b...
[truncated]
|
NB: Proofs of the transform: https://alive2.llvm.org/ce/z/AJDdQ8 They time out online, so they need to be run locally. Here is the output:
|
@@ -861,6 +861,9 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { | |||
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 | |||
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 | |||
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 | |||
; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 | |||
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 | |||
; AVX1-NEXT: vcmpltps %ymm3, %ymm1, %ymm1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure why this is happening. It seems we are determining the compare is dead before but not after.
Not sure what the best way to fix this is.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My guess is the PCMPGT handling in computeKnownBitsForTargetNode was managing to constant fold - but for some reason it never succeeded earlier?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Found it, its actually this fold in SimplifyDemandedBitsForTargetNode:
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
if (OriginalDemandedBits.isSignMask() &&
ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
return TLO.CombineTo(Op, Op.getOperand(1));
break;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would think it would be folded earlier? Is there a place I could move this change to such that it would be after all simplifications but still have reasonable access to KnownBits?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Think fixed. Added support for cmpp
in simplifydemandedbits.
@@ -1059,6 +1062,9 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { | |||
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5 | |||
; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0 | |||
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 | |||
; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 | |||
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 | |||
; AVX1-NEXT: vcmpltps %ymm5, %ymm2, %ymm2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same as above
; AVX1-NEXT: vpcmpgtd %xmm7, %xmm1, %xmm1 | ||
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 | ||
; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm3 | ||
; AVX1-NEXT: vcmpltps %ymm5, %ymm3, %ymm3 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same as above.
} else { | ||
// For cvt + signed compare we need abs(lhs) or abs(rhs) < | ||
// MaxConvertableCvt | ||
if (OpInRange(1, /*CmpLT*/ true, MaxConvertableCvt) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
&&
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
its ||
.
Proof is something like:
define i1 @src_eq(i32 %x, i32 %C) {
%C_abs = call i32 @llvm.abs.i32(i32 %C, i1 false)
%X_abs = call i32 @llvm.abs.i32(i32 %x, i1 false)
%C_lemma = icmp ult i32 %C_abs, 16777216
%X_lemma = icmp ult i32 %X_abs, 16777216
%lemma = or i1 %C_lemma, %X_lemma
call void @llvm.assume(i1 %lemma)
%cmp = icmp eq i32 %x, %C
ret i1 %cmp
}
define i1 @tgt_eq(i32 %x, i32 %C) {
%C_abs = call i32 @llvm.abs.i32(i32 %C, i1 false)
%X_abs = call i32 @llvm.abs.i32(i32 %x, i1 false)
%C_lemma = icmp ult i32 %C_abs, 16777216
%X_lemma = icmp ult i32 %X_abs, 16777216
%lemma = or i1 %C_lemma, %X_lemma
call void @llvm.assume(i1 %lemma)
%CFp = sitofp i32 %C to float
%conv = sitofp i32 %x to float
%cmp = fcmp oeq float %conv, %CFp
ret i1 %cmp
}
The condition:
%C_abs = call i32 @llvm.abs.i32(i32 %C, i1 false)
%X_abs = call i32 @llvm.abs.i32(i32 %x, i1 false)
%C_lemma = icmp ult i32 %C_abs, 16777216
%X_lemma = icmp ult i32 %X_abs, 16777216
%lemma = or i1 %C_lemma, %X_lemma
call void @llvm.assume(i1 %lemma)
is all ||
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Isn't abs(x) < c
equivalent to c > x > -c
? Otherwise x
could be any possible value satisfying x < c || x > -c
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that the x > -c
here is x u> -c
, which is a pretty tight bound.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I see — comparing a signed value with an unsigned operator. That was a bit difficult for me to notice :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure about the OkayBitcast codepath - I was just expecting some simple knownbits min/max value tests and SINT_TO_FP calls that would also handle constant folding.
@@ -23299,6 +23299,110 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, | |||
} | |||
} | |||
|
|||
// We get bad codegen for v8i32 compares on avx targets (without avx2) so if | |||
// possible convert to a v8f32 compare. | |||
if (VTOp0.getVectorElementType() == MVT::i32 && VTOp0 == MVT::v8i32 && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just test VTOp0 == MVT::v8i32 here? The VTOp0.getVectorElementType() == MVT::i32 check looks unnecessary.
bool OkayCvt = false; | ||
bool OkayBitcast = false; | ||
|
||
// For cvt up to 1 << (Significand Precision) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make sense to use the APFloat semantics instead of hard coded numbers here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not 100% sure, I reached the constants in the proofs via trial and error then realized these numbers are the most logical bound. Not certain though that they are indeed generalizable.
auto OpInRange = [&DAG, Op, &KnownOps](unsigned OpNo, bool CmpLT, | ||
const APInt Bound) { | ||
if (!KnownOps[OpNo].has_value()) | ||
KnownOps[OpNo] = DAG.computeKnownBits(Op.getOperand(OpNo)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you want to cache KnownOps[] - shouldn't we just call both at the start and be done with it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can get away with only computing Op1
if it's a small constant and there is no unsigned cond.
What's wrong with the |
Sorry I meant I wasn't expecting it - and haven't gone through it properly yet. |
a508e9e
to
0194e79
Compare
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] | ||
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 | ||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 | ||
; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, 32(%rdx) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Anything we can do to prevent this regression?
@@ -861,6 +861,9 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { | |||
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3 | |||
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0 | |||
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 | |||
; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 | |||
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 | |||
; AVX1-NEXT: vcmpltps %ymm3, %ymm1, %ymm1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My guess is the PCMPGT handling in computeKnownBitsForTargetNode was managing to constant fold - but for some reason it never succeeded earlier?
0194e79
to
73b8c4a
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
73b8c4a
to
bf63272
Compare
@@ -2471,6 +2474,8 @@ bool TargetLowering::SimplifyDemandedBits( | |||
if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO, | |||
Depth + 1)) | |||
return true; | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like nothing changed in this file; should the whole file be reverted?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, thank you!
I recall reading a comment raising a concern that when DAZ is set, fcmp will always compare denormal values as equal, but I cannot find the comment now. Is this a real concern? Did we fix it? |
Looks like that would be the result of DAZ, but I didn't test it. I'll update with |
Actually, I don't think this is an issue since we don't handle |
It affects all predicates, not only |
ah. We don't naturally have |
We should probably disable it regardless of the compiler settings because it'd be weird for FP flags to invalidate what looks like integer comparisons.... In my comment where I brought this up I posted a way to compare i32 for equality/inequality regardless of range, but it fails to generalize. I think your converting-to-float method is totally fine though. |
What do you mean by invalidate? The issue is simplifying a float signbit comparison to just return the float. DAZ can make the simplification incorrect. |
Maybe we're talking about different things then, sorry. I thought this was in relation to when we can or cannot bitcast integers to floats and do comparison as floats. For example, I think the restriction on C in |
Ah that is another issue... So there are two issues with DAZ:
Think both probably need to be fixed. Still re:
Do you mean drop the optimizations (if we can't prove normal)? I don't really see what's the issue with checking whether we are compiling with DAZ. |
fed3d38
to
351cefa
Compare
…83596) If we're logical shifting an all-signbits value, then we can just mask out the shifted bits. This helps removes some unnecessary bitcasted vXi16 shifts used for vXi8 shifts (which SimplifyDemandedBits will struggle to remove through the bitcast), and allows some AVX1 shifts of 256-bit values to stay as a YMM instruction. Noticed in codegen from #82290
rebase? |
351cefa
to
0da0ea1
Compare
Done. |
@@ -30,6 +30,7 @@ | |||
#include "llvm/Analysis/ObjCARCUtil.h" | |||
#include "llvm/Analysis/ProfileSummaryInfo.h" | |||
#include "llvm/Analysis/VectorUtils.h" | |||
#include "llvm/CodeGen/ISDOpcodes.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This shouldn't be necessary
} else if (auto *C = dyn_cast<ConstantSDNode>(Op1)) { | ||
Op1C = C->getAPIntValue(); | ||
} else if (ISD::isConstantSplatVector(Op1.getNode(), Op1C)) { | ||
// Pass |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pass? Make it clear that isConstantSplatVector sets Op1C
DAG, DL, ISD::SETGT, Op.getOperand(0), Op.getOperand(1), | ||
DemandedBits, /*AllowNOT*/ false, Depth)) | ||
return R; | ||
break; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this case necessary? I thought it was just to convert a XMM X86ISD::CMPP back to an integer?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, it's a common helper to do the simplifications for float/int.
I could add float->int, however.
be07876
to
f8b168b
Compare
Res = KnownBits::ult(*KnownOps[OpNo], KnownBits::makeConstant(Bound)); | ||
else | ||
Res = KnownBits::ugt(*KnownOps[OpNo], KnownBits::makeConstant(Bound)); | ||
return Res.has_value() && *Res; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe return Res.value_or(false)
?
@@ -41125,6 +41255,154 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, | |||
return SDValue(); | |||
} | |||
|
|||
// Simplify a decomposed (sext (setcc)). Assumes prior check that | |||
// bitwidth(sext)==bitwidth(setcc operands). | |||
static SDValue simplifySExtOfDecomposedSetCCImpl( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any hope of pulling this out into its own patch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will do
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@RKSimon separate commit in this PR, or new PR as a whole?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
new PR if possible - patches that do too much at once make me nervous :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can do, the only issue is that basically its functionality is dependent on this PR. Otherwise we never really get the necessary floating point flags to assist with the simplification.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So this PR will have some regressions in it; I'll post the other PR rebased on top of this one.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, dropped the simplifications from this patch. See: #84360 (based on this one).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note, there are some regressions in this patch now but they should be cleared up in #84360.
Also note, the reason #84360 is second is so that we can properly test the cmpp
code. Without this patch we never get the necessary FMF flags to actually do the cmpp
simplifications. If you think that is unnecessary I can swap the order (#84360 does stand on its own mostly for its simplifications on pcmpgt
).
f8b168b
to
adbc719
Compare
1094056
to
f6b0d17
Compare
f6b0d17
to
ba7d9d1
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any luck with the remaining regressions?
@@ -816,64 +816,6 @@ define float @test_v16f32(<16 x float> %a0) { | |||
; SSE41-NEXT: orps %xmm4, %xmm0 | |||
; SSE41-NEXT: retq | |||
; | |||
; AVX-LABEL: test_v16f32: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
still not fixed
Most of the regressions are fixed with #84360; the only outstanding one is tough to get around, as float -> int is much more difficult to |
Im trying to refactor so the code arbitrarily works to cvt int <-> fp compares. Then we can undo as needed. |
That doesn't really help. Maybe this isn't worth doing after all? |
Fixes: llvm#82242 The idea is that AVX doesn't support comparisons for `v8i32` so it splits the comparison into 2x `v4i32` comparisons + reconstruction of the `v8i32`. By converting to a float, we can handle the comparison with 1/2 instructions (1 if we can `bitcast`, 2 if we need to cast with `sitofp`). The Proofs: https://alive2.llvm.org/ce/z/AJDdQ8 Timeout, but they can be reproduced locally.
We currently only handle a single case for `pcmpgt`. This patch extends that to work for `cmpp` and handles comparators more generically.
ba7d9d1
to
c67e83a
Compare
@RKSimon, thoughts on dropping this? Do you think it's worth the complexity? |
Most of the regressions are caused by not simplifying |
// or rhs < MaxConvertableCvt | ||
|
||
if (OpInRange(1, /*CmpLT*/ true, APInt::getSignedMinValue(32)) && | ||
OpInRange(0, /*CmpLT*/ true, APInt::getSignedMinValue(32)) && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@goldsteinn The comment says both lhs/rhs >= 0 but the comparisons are with MIN_INT ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it's an unsigned compare. So foo u< MIN_INT
-> foo s>= 0
.
OpInRange(1, /*CmpLT*/ false, -MaxConvertableCvt) || | ||
OpInRange(0, /*CmpLT*/ true, MaxConvertableCvt) || | ||
OpInRange(0, /*CmpLT*/ false, -MaxConvertableCvt)) | ||
OkayCvt = true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't we need signed comparisons here - OpInRange just does unsigned comparisons? Ideally we'd use NumSignBits checks as an alternative to also check for being in signed bounds.
Fixes: #82242
The idea is that AVX doesn't support comparisons for
v8i32
so itsplits the comparison into 2x
v4i32
comparisons + reconstruction ofthe
v8i32
.By converting to a float, we can handle the comparison with 1/2
instructions (1 if we can
bitcast
, 2 if we need to cast withsitofp
).The Proofs: https://alive2.llvm.org/ce/z/AJDdQ8
Timeout, but they can be reproduced locally.