diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 5907e21065331..06e952e38d824 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -1933,13 +1933,18 @@ static Value *simplifyAndOrWithICmpEq(unsigned Opcode, Value *Op0, Value *Op1,
   // In the final case (Res == Absorber with inverted predicate), it is safe to
   // refine poison during simplification, but not undef. For simplicity always
   // disable undef-based folds here.
-  if (Value *Res = simplifyWithOpReplaced(Op1, A, B, Q.getWithoutUndef(),
-                                          /* AllowRefinement */ true,
-                                          /* DropFlags */ nullptr, MaxRecurse))
+  // Allow one extra recursion level for this speculative replace+simplify,
+  // as some folds require more than MaxRecurse replacements to appear.
+  unsigned LocalMaxRecurse = MaxRecurse ? MaxRecurse + 1 : 1;
+  if (Value *Res =
+          simplifyWithOpReplaced(Op1, A, B, Q.getWithoutUndef(),
+                                 /* AllowRefinement */ true,
+                                 /* DropFlags */ nullptr, LocalMaxRecurse))
     return Simplify(Res);
-  if (Value *Res = simplifyWithOpReplaced(Op1, B, A, Q.getWithoutUndef(),
-                                          /* AllowRefinement */ true,
-                                          /* DropFlags */ nullptr, MaxRecurse))
+  if (Value *Res =
+          simplifyWithOpReplaced(Op1, B, A, Q.getWithoutUndef(),
+                                 /* AllowRefinement */ true,
+                                 /* DropFlags */ nullptr, LocalMaxRecurse))
     return Simplify(Res);
 
   return nullptr;
diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
index 9896e5f4c8cae..78803540dd1ae 100644
--- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
@@ -6,26 +6,11 @@ declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable
 define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
 ; GCN-LABEL: select_constant_cttz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s4, 1, s2
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_ff1_i32_b32 s2, s4
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_and_b64 s[6:7], s[4:5], exec
-; GCN-NEXT:    s_cselect_b32 s2, -1, s2
-; GCN-NEXT:    s_flbit_i32 s6, s2
-; GCN-NEXT:    s_sub_i32 s8, 31, s6
-; GCN-NEXT:    s_cmp_eq_u32 s2, 0
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GCN-NEXT:    s_cselect_b32 s4, -1, s8
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %v = load i32, ptr addrspace(1) %arrayidx, align 4
@@ -43,3 +28,4 @@ define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, p
 }
 
 !0 = !{i32 0, i32 33}
+
diff --git a/llvm/test/Transforms/InstSimplify/and-or-implied-cond.ll b/llvm/test/Transforms/InstSimplify/and-or-implied-cond.ll
index 99e1dd4528697..aeae8a9880929 100644
--- a/llvm/test/Transforms/InstSimplify/and-or-implied-cond.ll
+++ b/llvm/test/Transforms/InstSimplify/and-or-implied-cond.ll
@@ -347,3 +347,107 @@ define i1 @pr98753(i32 noundef %x, i32 %y) {
 }
 
 declare i1 @llvm.is.constant.i1(i1)
+
+
+define i1 @or_icmp_fold(i64 %arg0) {
+; CHECK-LABEL: @or_icmp_fold(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[ARG0:%.*]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 55296
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], -1114112
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], -1112064
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %1 = lshr i64 %arg0, 32
+  %2 = trunc nuw i64 %1 to i32
+  %3 = xor i32 %2, 55296
+  %4 = add i32 %3, -1114112
+  %5 = icmp ult i32 %4, -1112064
+  %6 = icmp eq i64 %1, 1114112
+  %7 = or i1 %6, %5
+  ret i1 %7
+}
+
+
+define i1 @or_icmp_fold_negative(i64 %arg0) {
+; CHECK-LABEL: @or_icmp_fold_negative(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[ARG0:%.*]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 55296
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], -1114112
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1000
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP1]], 1114112
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    ret i1 [[TMP7]]
+;
+  %1 = lshr i64 %arg0, 32
+  %2 = trunc nuw i64 %1 to i32
+  %3 = xor i32 %2, 55296
+  %4 = add i32 %3, -1114112
+  %5 = icmp ult i32 %4, 1000
+  %6 = icmp eq i64 %1, 1114112
+  %7 = or i1 %6, %5
+  ret i1 %7
+}
+
+declare void @use(i32)
+
+define i1 @or_icmp_fold_multi_use(i64 %arg0) {
+; CHECK-LABEL: @or_icmp_fold_multi_use(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[ARG0:%.*]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 55296
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], -1114112
+; CHECK-NEXT:    call void @use(i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], -1112064
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %1 = lshr i64 %arg0, 32
+  %2 = trunc nuw i64 %1 to i32
+  %3 = xor i32 %2, 55296
+  %4 = add i32 %3, -1114112
+  call void @use(i32 %4)
+  %5 = icmp ult i32 %4, -1112064
+  %6 = icmp eq i64 %1, 1114112
+  %7 = or i1 %6, %5
+  ret i1 %7
+}
+
+define i1 @or_icmp_fold_commuted(i64 %arg0) {
+; CHECK-LABEL: @or_icmp_fold_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[ARG0:%.*]], 32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 55296
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], -1114112
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], -1112064
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %1 = lshr i64 %arg0, 32
+  %2 = trunc nuw i64 %1 to i32
+  %3 = xor i32 %2, 55296
+  %4 = add i32 %3, -1114112
+  %5 = icmp ult i32 %4, -1112064
+  %6 = icmp eq i64 %1, 1114112
+  %7 = or i1 %5, %6
+  ret i1 %7
+}
+
+
+define <2 x i1> @or_icmp_fold_vec(<2 x i64> %arg0) {
+; CHECK-LABEL: @or_icmp_fold_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <2 x i64> [[ARG0:%.*]], splat (i64 32)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> [[TMP2]], splat (i32 55296)
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 -1114112)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult <2 x i32> [[TMP4]], splat (i32 -1112064)
+; CHECK-NEXT:    ret <2 x i1> [[TMP5]]
+;
+  %1 = lshr <2 x i64> %arg0, splat (i64 32)
+  %2 = trunc <2 x i64> %1 to <2 x i32>
+  %3 = xor <2 x i32> %2, splat (i32 55296)
+  %4 = add <2 x i32> %3, splat (i32 -1114112)
+  %5 = icmp ult <2 x i32> %4, splat (i32 -1112064)
+  %6 = icmp eq <2 x i64> %1, splat (i64 1114112)
+  %7 = or <2 x i1> %6, %5
+  ret <2 x i1> %7
+}
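As an illustrative sketch only (not part of the patch itself), the substitution the new @or_icmp_fold test relies on can be traced with the test's own constants, assuming the icmp eq operand holds, i.e. %1 == 1114112:

; Substituting %1 = 1114112 (0x110000) into the other 'or' operand:
;   %2 = trunc nuw i64 1114112 to i32     ; -> 1114112
;   %3 = xor i32 1114112, 55296           ; -> 1169408 (0x11D800)
;   %4 = add i32 1169408, -1114112        ; -> 55296
;   %5 = icmp ult i32 55296, -1112064     ; -> true
; true is the absorbing element of 'or', so (or %6, %5) simplifies to %5.
; Reaching this result requires simplifyWithOpReplaced to recurse through all
; four intermediate instructions, which is what the LocalMaxRecurse bump allows.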