Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions llvm/lib/Analysis/InstructionSimplify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1933,13 +1933,18 @@ static Value *simplifyAndOrWithICmpEq(unsigned Opcode, Value *Op0, Value *Op1,
// In the final case (Res == Absorber with inverted predicate), it is safe to
// refine poison during simplification, but not undef. For simplicity always
// disable undef-based folds here.
if (Value *Res = simplifyWithOpReplaced(Op1, A, B, Q.getWithoutUndef(),
/* AllowRefinement */ true,
/* DropFlags */ nullptr, MaxRecurse))
// Allow one extra recursion level for this speculative replace+simplify,
// because some folds require more than MaxRecurse replacements to appear.
unsigned LocalMaxRecurse = MaxRecurse ? MaxRecurse + 1 : 1;
if (Value *Res =
simplifyWithOpReplaced(Op1, A, B, Q.getWithoutUndef(),
/* AllowRefinement */ true,
/* DropFlags */ nullptr, LocalMaxRecurse))
return Simplify(Res);
if (Value *Res = simplifyWithOpReplaced(Op1, B, A, Q.getWithoutUndef(),
/* AllowRefinement */ true,
/* DropFlags */ nullptr, MaxRecurse))
if (Value *Res =
simplifyWithOpReplaced(Op1, B, A, Q.getWithoutUndef(),
/* AllowRefinement */ true,
/* DropFlags */ nullptr, LocalMaxRecurse))
return Simplify(Res);

return nullptr;
Expand Down
22 changes: 4 additions & 18 deletions llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,11 @@ declare i32 @llvm.amdgcn.sffbh.i32(i32) nounwind readnone speculatable
define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; GCN-LABEL: select_constant_cttz:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s4, 1, s2
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_ff1_i32_b32 s2, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec
; GCN-NEXT: s_cselect_b32 s2, -1, s2
; GCN-NEXT: s_flbit_i32 s6, s2
; GCN-NEXT: s_sub_i32 s8, 31, s6
; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GCN-NEXT: s_cselect_b32 s4, -1, s8
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v0, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%v = load i32, ptr addrspace(1) %arrayidx, align 4
Expand All @@ -43,3 +28,4 @@ define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, p
}

!0 = !{i32 0, i32 33}

104 changes: 104 additions & 0 deletions llvm/test/Transforms/InstSimplify/and-or-implied-cond.ll
Original file line number Diff line number Diff line change
Expand Up @@ -347,3 +347,107 @@ define i1 @pr98753(i32 noundef %x, i32 %y) {
}

declare i1 @llvm.is.constant.i1(i1)


; Positive test: the final `or` of the eq compare (%6) and the ult range
; check (%5) simplifies to just %5. When %1 == 1114112, the trunc/xor/add
; chain produces 55296, which already satisfies `ult -1112064`, so %6
; implies %5 and the `or` folds away (the CHECK lines below return only %5).
define i1 @or_icmp_fold(i64 %arg0) {
; CHECK-LABEL: @or_icmp_fold(
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[ARG0:%.*]], 32
; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 55296
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -1114112
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], -1112064
; CHECK-NEXT: ret i1 [[TMP5]]
;
%1 = lshr i64 %arg0, 32
%2 = trunc nuw i64 %1 to i32
%3 = xor i32 %2, 55296
%4 = add i32 %3, -1114112
%5 = icmp ult i32 %4, -1112064
%6 = icmp eq i64 %1, 1114112
%7 = or i1 %6, %5
ret i1 %7
}


; Negative test: with the tighter range constant 1000, the eq compare is
; NOT implied by the ult check (when %1 == 1114112 the chain yields 55296,
; which is not ult 1000), so the `or` must be kept — the CHECK lines below
; retain both compares and the `or`.
define i1 @or_icmp_fold_negative(i64 %arg0) {
; CHECK-LABEL: @or_icmp_fold_negative(
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[ARG0:%.*]], 32
; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 55296
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -1114112
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1000
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP1]], 1114112
; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP5]]
; CHECK-NEXT: ret i1 [[TMP7]]
;
%1 = lshr i64 %arg0, 32
%2 = trunc nuw i64 %1 to i32
%3 = xor i32 %2, 55296
%4 = add i32 %3, -1114112
%5 = icmp ult i32 %4, 1000
%6 = icmp eq i64 %1, 1114112
%7 = or i1 %6, %5
ret i1 %7
}

declare void @use(i32)

; Same fold as @or_icmp_fold, but %4 has an extra use via @use. The fold
; only replaces the final `or` (it does not need to erase %4), so the
; extra use does not block it — the CHECK lines below still return %5.
define i1 @or_icmp_fold_multi_use(i64 %arg0) {
; CHECK-LABEL: @or_icmp_fold_multi_use(
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[ARG0:%.*]], 32
; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 55296
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -1114112
; CHECK-NEXT: call void @use(i32 [[TMP4]])
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], -1112064
; CHECK-NEXT: ret i1 [[TMP5]]
;
%1 = lshr i64 %arg0, 32
%2 = trunc nuw i64 %1 to i32
%3 = xor i32 %2, 55296
%4 = add i32 %3, -1114112
call void @use(i32 %4)
%5 = icmp ult i32 %4, -1112064
%6 = icmp eq i64 %1, 1114112
%7 = or i1 %6, %5
ret i1 %7
}

; Same as @or_icmp_fold with the `or` operands commuted (%5 first, %6
; second); the fold still fires and only %5 is returned per the CHECK
; lines below.
define i1 @or_icmp_fold_commuted(i64 %arg0) {
; CHECK-LABEL: @or_icmp_fold_commuted(
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[ARG0:%.*]], 32
; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 55296
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -1114112
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], -1112064
; CHECK-NEXT: ret i1 [[TMP5]]
;
%1 = lshr i64 %arg0, 32
%2 = trunc nuw i64 %1 to i32
%3 = xor i32 %2, 55296
%4 = add i32 %3, -1114112
%5 = icmp ult i32 %4, -1112064
%6 = icmp eq i64 %1, 1114112
%7 = or i1 %5, %6
ret i1 %7
}


; Splat-vector version of @or_icmp_fold; the fold applies elementwise and
; the `or` still simplifies to the ult compare per the CHECK lines below.
define <2 x i1> @or_icmp_fold_vec(<2 x i64> %arg0) {
; CHECK-LABEL: @or_icmp_fold_vec(
; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[ARG0:%.*]], splat (i64 32)
; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[TMP2]], splat (i32 55296)
; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 -1114112)
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <2 x i32> [[TMP4]], splat (i32 -1112064)
; CHECK-NEXT: ret <2 x i1> [[TMP5]]
;
%1 = lshr <2 x i64> %arg0, <i64 32, i64 32>
%2 = trunc <2 x i64> %1 to <2 x i32>
%3 = xor <2 x i32> %2, <i32 55296, i32 55296>
%4 = add <2 x i32> %3, <i32 -1114112, i32 -1114112>
%5 = icmp ult <2 x i32> %4, <i32 -1112064, i32 -1112064>
%6 = icmp eq <2 x i64> %1, <i64 1114112, i64 1114112>
%7 = or <2 x i1> %6, %5
ret <2 x i1> %7
}