-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[AMDGPU] Change patterns for v_[pk_]add_{min|max} #164881
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Change patterns for v_[pk_]add_{min|max} #164881
Conversation
The intermediate result is in fact the add with saturation regardless of the clamp bit.
|
@llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) Changes: The intermediate result is in fact the add with saturation regardless of the clamp bit. Patch is 34.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164881.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ee1019079f885..05ba76ab489d8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -976,10 +976,10 @@ def : GCNPat <
} // End SubtargetPredicate = HasLshlAddU64Inst
let SubtargetPredicate = HasAddMinMaxInsts in {
-def : ThreeOp_i32_Pats<add, smax, V_ADD_MAX_I32_e64>;
-def : ThreeOp_i32_Pats<add, umax, V_ADD_MAX_U32_e64>;
-def : ThreeOp_i32_Pats<add, smin, V_ADD_MIN_I32_e64>;
-def : ThreeOp_i32_Pats<add, umin, V_ADD_MIN_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smax, V_ADD_MAX_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umax, V_ADD_MAX_U32_e64>;
+def : ThreeOp_i32_Pats<saddsat, smin, V_ADD_MIN_I32_e64>;
+def : ThreeOp_i32_Pats<uaddsat, umin, V_ADD_MIN_U32_e64>;
}
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index c4692b71ca685..4ae2c1ed04dae 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -464,10 +464,10 @@ class ThreeOp_OpSelClampPats <SDPatternOperator op1, SDPatternOperator op2,
>;
let SubtargetPredicate = HasPkAddMinMaxInsts in {
-def : ThreeOp_OpSelClampPats<add, smax, V_PK_ADD_MAX_I16>;
-def : ThreeOp_OpSelClampPats<add, umax, V_PK_ADD_MAX_U16>;
-def : ThreeOp_OpSelClampPats<add, smin, V_PK_ADD_MIN_I16>;
-def : ThreeOp_OpSelClampPats<add, umin, V_PK_ADD_MIN_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smax, V_PK_ADD_MAX_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umax, V_PK_ADD_MAX_U16>;
+def : ThreeOp_OpSelClampPats<saddsat, smin, V_PK_ADD_MIN_I16>;
+def : ThreeOp_OpSelClampPats<uaddsat, umin, V_PK_ADD_MIN_U16>;
}
let SubtargetPredicate = HasPkMinMax3Insts in {
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
index b3a70577b6979..c55137574a9a4 100644
--- a/llvm/test/CodeGen/AMDGPU/add-max.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -7,7 +7,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -18,39 +18,38 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
-; SDAG-LABEL: add_max_u32_ssv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32 v0, s0, s1, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_ssv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+; GCN-LABEL: add_max_u32_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_max_u32 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
-; GCN-LABEL: add_max_u32_sss:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_add_co_i32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_max_u32 s0, s0, s2
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+; SDAG-LABEL: add_max_u32_sss:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_max_u32_e32 v0, s2, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_u32_sss:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_max_u32 v0, s0, s1, v0
+; GISEL-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -61,7 +60,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 4)
%ret = bitcast i32 %max to float
ret float %ret
@@ -72,26 +71,19 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_u32 v0, s0, v0, 0x64
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umax.i32(i32 %add, i32 100)
%ret = bitcast i32 %max to float
ret float %ret
}
-define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
-; SDAG-LABEL: add_max_u32_slv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_add_max_u32 v0, 0x64, s0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_u32_slv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_addk_co_i32 s0, 0x64
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: v_max_u32_e32 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add i32 %a, 100
- %max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
+define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_max_u32 v0, s0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
+ %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
}
@@ -101,7 +93,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_max_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -112,7 +104,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_min_u32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.umin.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -123,7 +115,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GCN: ; %bb.0:
; GCN-NEXT: v_add_min_i32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add i32 %a, %b
+ %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
%max = call i32 @llvm.smin.i32(i32 %add, i32 %c)
%ret = bitcast i32 %max to float
ret float %ret
@@ -134,7 +126,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -145,29 +137,18 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}
define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
-; SDAG-LABEL: add_max_v2u16_ssv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_ssv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s2, s0, 16
-; GISEL-NEXT: s_lshr_b32 s3, s1, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_add_co_i32 s2, s2, s3
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+; GCN-LABEL: add_max_v2u16_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -176,30 +157,18 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b
define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
; SDAG-LABEL: add_max_v2u16_sss:
; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
+; SDAG-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
; SDAG-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: add_max_v2u16_sss:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s3, s0, 16
-; GISEL-NEXT: s_lshr_b32 s4, s1, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, s1
-; GISEL-NEXT: s_add_co_i32 s3, s3, s4
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
-; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
-; GISEL-NEXT: s_lshr_b32 s1, s0, 16
-; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
-; GISEL-NEXT: s_lshr_b32 s2, s2, 16
-; GISEL-NEXT: s_max_u32 s0, s0, s3
-; GISEL-NEXT: s_max_u32 s1, s1, s2
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -210,7 +179,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -221,28 +190,18 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
%ret = bitcast <2 x i16> %max to float
ret float %ret
}
define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
-; SDAG-LABEL: add_max_v2u16_slv:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: add_max_v2u16_slv:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_lshr_b32 s1, s0, 16
-; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
-; GISEL-NEXT: s_addk_co_i32 s1, 0x64
-; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
-; GISEL-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, <i16 100, i16 100>
+; GCN-LABEL: add_max_v2u16_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> <i16 100, i16 100>)
%max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %add, <2 x i16> %b)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -253,7 +212,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -264,7 +223,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
@@ -275,7 +234,7 @@ define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
- %add = add <2 x i16> %a, %b
+ %add = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b)
%max = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %add, <2 x i16> %c)
%ret = bitcast <2 x i16> %max to float
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 711d57baac15f..30ad46d959b7e 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -39131,21 +39131,21 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX1250-NEXT: v_cls_i32_e32 v3, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_add_nc_u32 v3, -1, v3 :: v_dual_bitop2_b32 v2, v0, v1 bitop3:0x14
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2
+; GFX1250-NEXT: v_min_u32_e32 v2, v3, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_min_u32 v2, v3, -1, v2
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = sitofp i64 %x to bfloat
@@ -39483,29 +39483,30 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX1250-NEXT: v_xor_b32_e32 v4, v2, v3
; GFX1250-NEXT: v_cls_i32_e32 v6, v3
; GFX1250-NEXT: v_cls_i32_e32 v7, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4
+; GFX1250-NEXT: v_dual_add_nc_u32 v6, -1, v6 :: v_dual_add_nc_u32 v7, -1, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_min_u32 v5, v7, -1, v5
-; GFX1250-NEXT: v_add_min_u32 v4, v6, -1, v4
+; GFX1250-NEXT: v_min_u32_e32 v5, v7, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_min_u32_e32 v4, v6, v4
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1]
-; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3]
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
; GFX1250-NEXT: v_sub_nc_u32_e32 v3, 32, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v1, v2, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = sitofp <2 x i64> %x to <2 x bfloat>
@@ -39968,41 +39969,42 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250TRUE16: ; %bb.0:
; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v2, v3
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v6, v4, v5
+; GFX1250TRUE16-NEXT: v_cls_i32_e32 v6, v5
+; GFX1250TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5
; GFX1250TRUE16-NEXT: v_cls_i32_e32 v10, v3
-; GFX1250TRUE16-NEXT: v_cls_i32_e32 v9, v5
; GFX1250TRUE16-NEXT: v_cls_i32_e32 v11, v1
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_dual_ashrrev_i32 v7, 31, v7 :: v_dual_ashrrev_i32 v6, 31, v6
-; GFX1250TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1
-; GFX1250TRUE16-NEXT: v_dual_add_nc_u32 v7, 32, v7 :: v_dual_add_nc_u32 v6, 32, v6
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX1250TRUE16-NEXT: v_add_min_u32 v7, v10, -1, v7
-; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250TRUE16-NEXT: v_add_min_u32 v6, v9, -1, v6
-; GFX1250TRUE16-NEXT: v_lshlrev_b64_e32 v[2:3], v7, v[2:3]
-; GFX1250TRUE16-N...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Interesting, ISel works in the first place.
The intermediate result is in fact the add with saturation regardless of the clamp bit.
The intermediate result is in fact the add with saturation regardless of the clamp bit.
The intermediate result is in fact the add with saturation regardless of the clamp bit.

The intermediate result is in fact the add with saturation
regardless of the clamp bit.