diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 40891efac3395..f26de051e0494 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7826,6 +7826,14 @@ class HorizontalReduction { if (V.isLoadCombineReductionCandidate(RdxKind)) break; + // For a poison-safe boolean logic reduction, do not replace select + // instructions with logic ops. All reduced values will be frozen (see + // below) to prevent leaking poison. + if (isa<SelectInst>(ReductionRoot) && + isBoolLogicOp(cast<Instruction>(ReductionRoot)) && + NumReducedVals != ReduxWidth) + break; + V.computeMinimumValueSizes(); // Estimate cost. diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll index 479ad437ba1ac..02a15a36eaf61 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -97,16 +97,16 @@ define float @test_merge_anyof_v4sf(<4 x float> %t) { ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[T]], i32 0 ; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 ; CHECK-NEXT: [[CMP19:%.*]] = fcmp ogt float [[TMP3]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP19]] ; CHECK-NEXT: [[CMP24:%.*]] = fcmp ogt float [[TMP2]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP24]] ; CHECK-NEXT: [[CMP29:%.*]] = fcmp ogt float [[TMP1]], 1.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[CMP19]] -; CHECK-NEXT: [[TMP8:%.*]] = 
or i1 [[TMP7]], [[CMP24]] -; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[CMP29]] +; CHECK-NEXT: [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP29]] ; CHECK-NEXT: [[CMP34:%.*]] = fcmp ogt float [[TMP0]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[TMP9]], i1 true, i1 [[CMP34]] +; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP34]] ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]] ; CHECK-NEXT: ret float [[RETVAL_0]] @@ -269,16 +269,16 @@ define float @test_separate_anyof_v4sf(<4 x float> %t) { ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[T]], i32 0 ; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 ; CHECK-NEXT: [[CMP18:%.*]] = fcmp ogt float [[TMP3]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP18]] ; CHECK-NEXT: [[CMP23:%.*]] = fcmp ogt float [[TMP2]], 1.000000e+00 +; CHECK-NEXT: [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP23]] ; CHECK-NEXT: [[CMP28:%.*]] = fcmp ogt float [[TMP1]], 1.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[CMP18]] -; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[CMP23]] -; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[CMP28]] +; CHECK-NEXT: [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP28]] ; CHECK-NEXT: [[CMP33:%.*]] = fcmp ogt float [[TMP0]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[TMP9]], i1 true, i1 [[CMP33]] +; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP33]] ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP2]] ; CHECK-NEXT: 
[[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]] ; CHECK-NEXT: ret float [[RETVAL_0]] @@ -436,16 +436,16 @@ define float @test_merge_anyof_v4si(<4 x i32> %t) { ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[T]], i32 0 ; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x i32> [[T]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[T_FR]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[TMP3]], 255 +; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP11]] ; CHECK-NEXT: [[CMP14:%.*]] = icmp sgt i32 [[TMP2]], 255 +; CHECK-NEXT: [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP14]] ; CHECK-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[TMP1]], 255 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[CMP11]] -; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[CMP14]] -; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[CMP17]] +; CHECK-NEXT: [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP17]] ; CHECK-NEXT: [[CMP20:%.*]] = icmp sgt i32 [[TMP0]], 255 -; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[TMP9]], i1 true, i1 [[CMP20]] +; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP20]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float ; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[CONV]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 312217d4af963..46894009fca56 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -168,6 +168,10 @@ define i1 @mixed_logical_icmp(<4 x i32> %x) { ret i1 %s3 } +; TODO: This is 
better than all-scalar and still safe, +; but we want this to be 2 reductions with glue +; logic...or a wide reduction? + define i1 @logical_and_icmp_clamp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 @@ -181,10 +185,10 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) { ; CHECK-NEXT: [[D3:%.*]] = icmp sgt i32 [[TMP1]], 17 ; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = and i1 [[TMP7]], [[D0]] -; CHECK-NEXT: [[TMP9:%.*]] = and i1 [[TMP8]], [[D1]] -; CHECK-NEXT: [[TMP10:%.*]] = and i1 [[TMP9]], [[D2]] -; CHECK-NEXT: [[S7:%.*]] = select i1 [[TMP10]], i1 [[D3]], i1 false +; CHECK-NEXT: [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false +; CHECK-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; CHECK-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; CHECK-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false ; CHECK-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0