Skip to content

Conversation

@rampitec
Copy link
Collaborator

The intermediate result is in fact the add with saturation
regardless of the clamp bit.

The intermediate result is in fact the add with saturation
regardless of the clamp bit.
Copy link
Collaborator Author

This stack of pull requests is managed by Graphite. Learn more about stacking.

@rampitec rampitec marked this pull request as ready for review October 23, 2025 20:00
@llvmbot
Copy link
Member

llvmbot commented Oct 23, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

The intermediate result is in fact the add with saturation
regardless of the clamp bit.


Patch is 34.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164881.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+4-4)
  • (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/add-max.ll (+55-96)
  • (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+113-105)
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ee1019079f885..05ba76ab489d8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -976,10 +976,10 @@ def : GCNPat <
 } // End SubtargetPredicate = HasLshlAddU64Inst
 
 let SubtargetPredicate = HasAddMinMaxInsts in {
-def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
-def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
-def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>;
-def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>;
 }
 
 def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index c4692b71ca685..4ae2c1ed04dae 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -464,10 +464,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
 >;
 
 let SubtargetPredicate = HasPkAddMinMaxInsts in {
-def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
-def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
-def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
-def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smax, V_PK_ADD_MAX_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>;
 }
 
 let SubtargetPredicate = HasPkMinMax3Insts in {
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
index b3a70577b6979..c55137574a9a4 100644
--- a/llvm/test/CodeGen/AMDGPU/add-max.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -7,7 +7,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_u32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -18,39 +18,38 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_u32 v0, s0, v0, v1
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
 }
 
 define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
-; SDAG-LABEL: add_max_u32_ssv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_add_max_u32 v0, s0, s1, v0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_ssv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_add_co_i32 s0, s0, s1
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+; GCN-LABEL: add_max_u32_ssv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_add_max_u32 v0, s0, s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
 }
 
 define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
-; GCN-LABEL: add_max_u32_sss:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_co_i32 s0, s0, s1
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_max_u32 s0, s0, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+; SDAG-LABEL: add_max_u32_sss:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_add_nc_u32_e64 v0, s0, s1 clamp
+; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT:    v_max_u32_e32 v0, s2, v0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_u32_sss:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_add_max_u32 v0, s0, s1, v0
+; GISEL-NEXT:    ; return to shader part epilog
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -61,7 +60,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_u32 v0, v0, s0, 4
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 4)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -72,26 +71,19 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_u32 v0, s0, v0, 0x64
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umax.i32(i32 %add, i32 100)
   %ret = bitcast i32 %max to float
   ret float %ret
 }
 
-define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
-; SDAG-LABEL: add_max_u32_slv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_add_max_u32 v0, 0x64, s0, v0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_slv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_addk_co_i32 s0, 0x64
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, 100
-  %max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
+define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_slv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_add_max_u32 v0, s0, v0, v1
+; GCN-NEXT:    ; return to shader part epilog
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
+  %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
 }
@@ -101,7 +93,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_max_i32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -112,7 +104,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_min_u32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.umin.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -123,7 +115,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_add_min_i32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add i32 %a, %b
+  %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
   %max = call i32 @llvm.smin.i32(i32 %add, i32 %c)
   %ret = bitcast i32 %max to float
   ret float %ret
@@ -134,7 +126,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_u16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -145,29 +137,18 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_u16 v0, s0, v0, v1
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
 }
 
 define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
-; SDAG-LABEL: add_max_v2u16_ssv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_pk_add_max_u16 v0, s0, s1, v0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_ssv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_lshr_b32 s2, s0, 16
-; GISEL-NEXT:    s_lshr_b32 s3, s1, 16
-; GISEL-NEXT:    s_add_co_i32 s0, s0, s1
-; GISEL-NEXT:    s_add_co_i32 s2, s2, s3
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
-; GISEL-NEXT:    v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+; GCN-LABEL: add_max_v2u16_ssv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_pk_add_max_u16 v0, s0, s1, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -176,30 +157,18 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b
 define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
 ; SDAG-LABEL: add_max_v2u16_sss:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_pk_add_u16 v0, s0, s1
+; SDAG-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-NEXT:    v_pk_max_u16 v0, v0, s2
 ; SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GISEL-LABEL: add_max_v2u16_sss:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_lshr_b32 s3, s0, 16
-; GISEL-NEXT:    s_lshr_b32 s4, s1, 16
-; GISEL-NEXT:    s_add_co_i32 s0, s0, s1
-; GISEL-NEXT:    s_add_co_i32 s3, s3, s4
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GISEL-NEXT:    s_and_b32 s3, s2, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s1, s0, 16
-; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT:    s_lshr_b32 s2, s2, 16
-; GISEL-NEXT:    s_max_u32 s0, s0, s3
-; GISEL-NEXT:    s_max_u32 s1, s1, s2
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT:    v_pk_add_max_u16 v0, s0, s1, v0
 ; GISEL-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -210,7 +179,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_u16 v0, v0, s0, 4
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -221,28 +190,18 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_u16 v0, s0, v0, 0x650064
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
 }
 
 define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
-; SDAG-LABEL: add_max_v2u16_slv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_pk_add_max_u16 v0, 0x640064, s0, v0
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_slv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_lshr_b32 s1, s0, 16
-; GISEL-NEXT:    s_add_co_i32 s0, s0, 0x640064
-; GISEL-NEXT:    s_addk_co_i32 s1, 0x64
-; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT:    v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, <i16 100, i16 100>
+; GCN-LABEL: add_max_v2u16_slv:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_pk_add_max_u16 v0, 0x640064, s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> <i16 100, i16 100>)
   %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %b)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -253,7 +212,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_max_i16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.smax.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -264,7 +223,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_min_u16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.umin.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
@@ -275,7 +234,7 @@ define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_pk_add_min_i16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
-  %add = add <2 x i16> %a, %b
+  %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
   %max = call <2 x i16> @llvm.smin.v216(<2 x i16> %add, <2 x i16> %c)
   %ret = bitcast <2 x i16> %max to float
   ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 711d57baac15f..30ad46d959b7e 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -39131,21 +39131,21 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_xor_b32_e32 v2, v0, v1
 ; GFX1250-NEXT:    v_cls_i32_e32 v3, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_add_nc_u32 v3, -1, v3 :: v_dual_bitop2_b32 v2, v0, v1 bitop3:0x14
 ; GFX1250-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_add_nc_u32_e32 v2, 32, v2
+; GFX1250-NEXT:    v_min_u32_e32 v2, v3, v2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:    v_add_min_u32 v2, v3, -1, v2
 ; GFX1250-NEXT:    v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT:    v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
 ; GFX1250-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %op = sitofp i64 %x to bfloat
@@ -39483,29 +39483,30 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
 ; GFX1250-NEXT:    v_xor_b32_e32 v4, v2, v3
 ; GFX1250-NEXT:    v_cls_i32_e32 v6, v3
 ; GFX1250-NEXT:    v_cls_i32_e32 v7, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4
+; GFX1250-NEXT:    v_dual_add_nc_u32 v6, -1, v6 :: v_dual_add_nc_u32 v7, -1, v7
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT:    v_add_min_u32 v5, v7, -1, v5
-; GFX1250-NEXT:    v_add_min_u32 v4, v6, -1, v4
+; GFX1250-NEXT:    v_min_u32_e32 v5, v7, v5
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_min_u32_e32 v4, v6, v4
 ; GFX1250-NEXT:    v_lshlrev_b64_e32 v[0:1], v5, v[0:1]
-; GFX1250-NEXT:    v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
 ; GFX1250-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT:    v_min_u32_e32 v2, 1, v2
 ; GFX1250-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
 ; GFX1250-NEXT:    v_sub_nc_u32_e32 v3, 32, v5
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1250-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %op = sitofp <2 x i64> %x to <2 x bfloat>
@@ -39968,41 +39969,42 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX1250TRUE16:       ; %bb.0:
 ; GFX1250TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT:    v_xor_b32_e32 v7, v2, v3
-; GFX1250TRUE16-NEXT:    v_xor_b32_e32 v6, v4, v5
+; GFX1250TRUE16-NEXT:    v_cls_i32_e32 v6, v5
+; GFX1250TRUE16-NEXT:    v_xor_b32_e32 v7, v4, v5
 ; GFX1250TRUE16-NEXT:    v_cls_i32_e32 v10, v3
-; GFX1250TRUE16-NEXT:    v_cls_i32_e32 v9, v5
 ; GFX1250TRUE16-NEXT:    v_cls_i32_e32 v11, v1
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT:    v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_ashrrev_i32 v6, 31, v6
-; GFX1250TRUE16-NEXT:    v_xor_b32_e32 v8, v0, v1
-; GFX1250TRUE16-NEXT:    v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; GFX1250TRUE16-NEXT:    v_add_min_u32 v7, v10, -1, v7
-; GFX1250TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT:    v_add_min_u32 v6, v9, -1, v6
-; GFX1250TRUE16-NEXT:    v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
-; GFX1250TRUE16-N...
[truncated]

Copy link
Contributor

@shiltian shiltian left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, ISel works in the first place.

@rampitec rampitec merged commit ef923f1 into main Oct 23, 2025
14 checks passed
@rampitec rampitec deleted the users/rampitec/10-23-_amdgpu_change_patterns_for_v__pk__add__min_max_ branch October 23, 2025 22:45
dvbuka pushed a commit to dvbuka/llvm-project that referenced this pull request Oct 27, 2025
The intermediate result is in fact the add with saturation
regardless of the clamp bit.
Lukacma pushed a commit to Lukacma/llvm-project that referenced this pull request Oct 29, 2025
The intermediate result is in fact the add with saturation
regardless of the clamp bit.
aokblast pushed a commit to aokblast/llvm-project that referenced this pull request Oct 30, 2025
The intermediate result is in fact the add with saturation
regardless of the clamp bit.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants