From 2af6457e115bd2bebd335416a936b2ae43de4851 Mon Sep 17 00:00:00 2001 From: Jerry Dang Date: Wed, 19 Nov 2025 20:57:26 -0500 Subject: [PATCH 1/2] [DAG] Add TRUNCATE_SSAT_S/U and TRUNCATE_USAT_U to canCreateUndefOrPoison (#152143) Saturating truncation operations are well-defined for all inputs and cannot create poison or undef values. This allows the optimizer to eliminate unnecessary freeze instructions after these operations. Fixes #152143 --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 + llvm/test/CodeGen/X86/truncate-sat-freeze.ll | 64 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 llvm/test/CodeGen/X86/truncate-sat-freeze.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 1b15a207a2d37..0f0174c8aea35 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5664,6 +5664,9 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::FP_EXTEND: case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: + case ISD::TRUNCATE_SSAT_U: + case ISD::TRUNCATE_SSAT_S: + case ISD::TRUNCATE_USAT_U: // No poison except from flags (which is handled above) return false; diff --git a/llvm/test/CodeGen/X86/truncate-sat-freeze.ll b/llvm/test/CodeGen/X86/truncate-sat-freeze.ll new file mode 100644 index 0000000000000..78aebe05ec1de --- /dev/null +++ b/llvm/test/CodeGen/X86/truncate-sat-freeze.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s + +; Test that freeze is eliminated for saturation truncate patterns. +; The freeze elimination happens at the IR level due to the IntrNoCreateUndefOrPoison +; attribute on the llvm.smax/smin/umin intrinsics. 
At the SelectionDAG level, +; TRUNCATE_SSAT_S/U and TRUNCATE_USAT_U operations are also marked in +; canCreateUndefOrPoison() to ensure consistency and enable potential future +; optimizations. This test validates the end-to-end behavior that no freeze +; instruction appears in the output. + +define <2 x i32> @trunc_ssat_s_freeze(<2 x i64> %a0) { +; CHECK-LABEL: trunc_ssat_s_freeze: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: retq + %1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a0, <2 x i64> <i64 -2147483648, i64 -2147483648>) + %2 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %1, <2 x i64> <i64 2147483647, i64 2147483647>) + %3 = trunc <2 x i64> %2 to <2 x i32> + %4 = freeze <2 x i32> %3 + ret <2 x i32> %4 +} + +define <2 x i32> @trunc_ssat_u_freeze(<2 x i64> %a0) { +; CHECK-LABEL: trunc_ssat_u_freeze: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] +; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: retq + %1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a0, <2 x i64> zeroinitializer) + %2 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %1, <2 x i64> <i64 4294967295, i64 4294967295>) + %3 = trunc <2 x i64> %2 to <2 x i32> + %4 = freeze <2 x i32> %3 + ret <2 x i32> %4 +} + +define <2 x i32> @trunc_usat_u_freeze(<2 x i64> %a0) { +; CHECK-LABEL: trunc_usat_u_freeze: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, 
%xmm1 +; CHECK-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: retq + %1 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>) + %2 = trunc <2 x i64> %1 to <2 x i32> + %3 = freeze <2 x i32> %2 + ret <2 x i32> %3 +} + +declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>) From 1aa7f89cb92af08d4a60c79dedff0025b30af8df Mon Sep 17 00:00:00 2001 From: Jerry Dang Date: Thu, 20 Nov 2025 22:15:35 -0500 Subject: [PATCH 2/2] Add tests in AArch64; Remove previous tests in X86 --- .../CodeGen/AArch64/truncate-sat-freeze.ll | 80 +++++++++++++++++++ llvm/test/CodeGen/X86/truncate-sat-freeze.ll | 64 --------------- 2 files changed, 80 insertions(+), 64 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll delete mode 100644 llvm/test/CodeGen/X86/truncate-sat-freeze.ll diff --git a/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll b/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll new file mode 100644 index 0000000000000..97bf1bac2a7db --- /dev/null +++ b/llvm/test/CodeGen/AArch64/truncate-sat-freeze.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s + +; Test that saturating truncate operations work correctly with freeze. +; These intrinsics map to TRUNCATE_SSAT_S, TRUNCATE_SSAT_U, and TRUNCATE_USAT_U, +; which are marked in canCreateUndefOrPoison() as not creating poison. +; This allows freeze to be eliminated, enabling optimizations like select simplification. 
+ +define <4 x i16> @sqxtn_with_freeze(<4 x i32> %a) { +; CHECK-LABEL: sqxtn_with_freeze: +; CHECK: // %bb.0: +; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %trunc = tail call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %a) + %freeze = freeze <4 x i16> %trunc + ret <4 x i16> %freeze +} + +define <4 x i16> @sqxtun_with_freeze(<4 x i32> %a) { +; CHECK-LABEL: sqxtun_with_freeze: +; CHECK: // %bb.0: +; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: ret + %trunc = tail call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %a) + %freeze = freeze <4 x i16> %trunc + ret <4 x i16> %freeze +} + +define <8 x i8> @uqxtn_with_freeze(<8 x i16> %a) { +; CHECK-LABEL: uqxtn_with_freeze: +; CHECK: // %bb.0: +; CHECK-NEXT: uqxtn v0.8b, v0.8h +; CHECK-NEXT: ret + %trunc = tail call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %a) + %freeze = freeze <8 x i8> %trunc + ret <8 x i8> %freeze +} + +; Test freeze elimination enables select simplification for sqxtn +define <4 x i16> @test_sqxtn_freeze_removal_select(<4 x i32> %a, i1 %cond) { +; CHECK-LABEL: test_sqxtn_freeze_removal_select: +; CHECK: // %bb.0: +; CHECK-NEXT: sqxtn v0.4h, v0.4s +; CHECK-NEXT: ret + %safe_a = freeze <4 x i32> %a + %val = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %safe_a) + %frozen_val = freeze <4 x i16> %val + %res = select i1 %cond, <4 x i16> %frozen_val, <4 x i16> %val + ret <4 x i16> %res +} + +; Test freeze elimination enables select simplification for sqxtun +define <4 x i16> @test_sqxtun_freeze_removal_select(<4 x i32> %a, i1 %cond) { +; CHECK-LABEL: test_sqxtun_freeze_removal_select: +; CHECK: // %bb.0: +; CHECK-NEXT: sqxtun v0.4h, v0.4s +; CHECK-NEXT: ret + %safe_a = freeze <4 x i32> %a + %val = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %safe_a) + %frozen_val = freeze <4 x i16> %val + %res = select i1 %cond, <4 x i16> %frozen_val, <4 x i16> %val + ret <4 x i16> %res +} + +; Test freeze elimination enables select simplification for uqxtn +define <8 x i8> 
@test_uqxtn_freeze_removal_select(<8 x i16> %a, i1 %cond) { +; CHECK-LABEL: test_uqxtn_freeze_removal_select: +; CHECK: // %bb.0: +; CHECK-NEXT: uqxtn v0.8b, v0.8h +; CHECK-NEXT: ret + %safe_a = freeze <8 x i16> %a + %val = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %safe_a) + %frozen_val = freeze <8 x i8> %val + %res = select i1 %cond, <8 x i8> %frozen_val, <8 x i8> %val + ret <8 x i8> %res +} + +declare <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>) +declare <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32>) +declare <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16>) diff --git a/llvm/test/CodeGen/X86/truncate-sat-freeze.ll b/llvm/test/CodeGen/X86/truncate-sat-freeze.ll deleted file mode 100644 index 78aebe05ec1de..0000000000000 --- a/llvm/test/CodeGen/X86/truncate-sat-freeze.ll +++ /dev/null @@ -1,64 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s - -; Test that freeze is eliminated for saturation truncate patterns. -; The freeze elimination happens at the IR level due to the IntrNoCreateUndefOrPoison -; attribute on the llvm.smax/smin/umin intrinsics. At the SelectionDAG level, -; TRUNCATE_SSAT_S/U and TRUNCATE_USAT_U operations are also marked in -; canCreateUndefOrPoison() to ensure consistency and enable potential future -; optimizations. This test validates the end-to-end behavior that no freeze -; instruction appears in the output. 
- -define <2 x i32> @trunc_ssat_s_freeze(<2 x i64> %a0) { -; CHECK-LABEL: trunc_ssat_s_freeze: -; CHECK: # %bb.0: -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] -; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: retq - %1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a0, <2 x i64> <i64 -2147483648, i64 -2147483648>) - %2 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %1, <2 x i64> <i64 2147483647, i64 2147483647>) - %3 = trunc <2 x i64> %2 to <2 x i32> - %4 = freeze <2 x i32> %3 - ret <2 x i32> %4 -} - -define <2 x i32> @trunc_ssat_u_freeze(<2 x i64> %a0) { -; CHECK-LABEL: trunc_ssat_u_freeze: -; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4294967295,0,4294967295,0] -; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: retq - %1 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %a0, <2 x i64> zeroinitializer) - %2 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %1, <2 x i64> <i64 4294967295, i64 4294967295>) - %3 = trunc <2 x i64> %2 to <2 x i32> - %4 = freeze <2 x i32> %3 - ret <2 x i32> %4 -} - -define <2 x i32> @trunc_usat_u_freeze(<2 x i64> %a0) { -; CHECK-LABEL: trunc_usat_u_freeze: -; CHECK: # %bb.0: -; CHECK-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: retq - %1 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>) - %2 = trunc <2 x i64> %1 to <2 x i32> - %3 = freeze <2 x i32> %2 - 
ret <2 x i32> %3 -} - -declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>) -declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) -declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>)