-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[PeepholeOptimizer] Recognize new move-immediate instructions #72128
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Jay Foad (jayfoad) Changes: Folding a move-immediate into another move can create a new move-immediate. Recognize these and use them as sources for further folding. For the AMDGPU target this happens with sequences like: s_mov_b32 s0, 12345 / v_mov_b32 v0, s0. The second instruction will be folded to: v_mov_b32 v0, 12345. With this patch, the immediate value 12345 can then be folded into further uses of v0. Patch is 146.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72128.diff 10 Files Affected:
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 76b3b16af16bdc7..5914450162c8e18 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1811,6 +1811,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
LocalMIs.erase(MI);
continue;
}
+ isMoveImmediate(*MI, ImmDefRegs, ImmDefMIs);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 3eb6f1eced0957f..7d3e0208cdb9dbe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -33,12 +33,12 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc
-; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6
+; CHECK-NEXT: v_madmk_f32 v3, v6, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3
; CHECK-NEXT: v_trunc_f32_e32 v8, v6
-; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8
+; CHECK-NEXT: v_madmk_f32 v3, v8, 0xcf800000, v3
; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3
; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8
; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0
@@ -215,13 +215,13 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_mov_b32 s7, s6
; CHECK-NEXT: s_xor_b64 s[12:13], s[0:1], s[6:7]
; CHECK-NEXT: s_sub_u32 s3, 0, s10
-; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
+; CHECK-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: s_subb_u32 s5, 0, s11
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
+; CHECK-NEXT: v_madmk_f32 v0, v2, 0xcf800000, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
@@ -669,12 +669,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1
; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc
-; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
+; CGP-NEXT: v_madmk_f32 v3, v4, 0x4f800000, v3
; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; CGP-NEXT: v_trunc_f32_e32 v5, v4
-; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
+; CGP-NEXT: v_madmk_f32 v3, v5, 0xcf800000, v3
; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3
; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5
; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
@@ -842,12 +842,12 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4
; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
-; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; CGP-NEXT: v_madmk_f32 v5, v6, 0x4f800000, v5
; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; CGP-NEXT: v_trunc_f32_e32 v7, v6
-; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
+; CGP-NEXT: v_madmk_f32 v5, v7, 0xcf800000, v5
; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5
; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7
; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
@@ -1002,12 +1002,12 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
+; CHECK-NEXT: v_madmk_f32 v2, v4, 0xcf800000, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
@@ -1607,12 +1607,12 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
+; CHECK-NEXT: v_madmk_f32 v2, v4, 0xcf800000, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
@@ -2237,12 +2237,12 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
-; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; CHECK-NEXT: v_madmk_f32 v5, v6, 0x4f800000, v5
; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; CHECK-NEXT: v_trunc_f32_e32 v7, v6
-; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
+; CHECK-NEXT: v_madmk_f32 v5, v7, 0xcf800000, v5
; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5
; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
@@ -2693,12 +2693,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1
; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
-; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
+; CGP-NEXT: v_madmk_f32 v10, v11, 0x4f800000, v10
; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10
; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10
; CGP-NEXT: v_trunc_f32_e32 v12, v11
-; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12
+; CGP-NEXT: v_madmk_f32 v10, v12, 0xcf800000, v10
; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10
; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
@@ -2868,12 +2868,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4
; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
-; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8
+; CGP-NEXT: v_madmk_f32 v6, v8, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6
; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
; CGP-NEXT: v_trunc_f32_e32 v10, v8
-; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10
+; CGP-NEXT: v_madmk_f32 v6, v10, 0xcf800000, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10
; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 0b22b3b3a4ba7c6..fc904cda5d279ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -33,12 +33,12 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1
; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v6, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6
+; CHECK-NEXT: v_madmk_f32 v2, v6, 0xcf800000, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6
; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
@@ -209,13 +209,13 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_mov_b32 s7, s6
; CHECK-NEXT: s_xor_b64 s[10:11], s[10:11], s[6:7]
; CHECK-NEXT: s_sub_u32 s3, 0, s8
-; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
+; CHECK-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: s_subb_u32 s5, 0, s9
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
+; CHECK-NEXT: v_madmk_f32 v0, v2, 0xcf800000, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
@@ -655,12 +655,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0
; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
-; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CGP-NEXT: v_trunc_f32_e32 v4, v3
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
+; CGP-NEXT: v_madmk_f32 v2, v4, 0xcf800000, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4
; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
@@ -824,12 +824,12 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; CGP-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v6, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
+; CGP-NEXT: v_madmk_f32 v4, v6, 0xcf800000, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
@@ -980,12 +980,12 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
+; CHECK-NEXT: v_madmk_f32 v2, v4, 0xcf800000, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
@@ -1575,12 +1575,12 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
+; CHECK-NEXT: v_madmk_f32 v2, v4, 0xcf800000, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
@@ -2195,12 +2195,12 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1
; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
+; CHECK-NEXT: v_madmk_f32 v2, v5, 0x4f800000, v2
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v7, v5
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
+; CHECK-NEXT: v_madmk_f32 v2, v7, 0xcf800000, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
@@ -2645,12 +2645,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1
; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0
; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10
+; CGP-NEXT: v_madmk_f32 v4, v10, 0x4f800000, v4
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v12, v10
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
+; CGP-NEXT: v_madmk_f32 v4, v12, 0xcf800000, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
@@ -2819,12 +2819,12 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT: v_madmk_f32 v4, v6, 0x4f800000, v4
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v6, v6
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
+; CGP-NEXT: v_madmk_f32 v4, v6, 0xcf800000, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 3add708d1a6394d..342d04141ced45e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -28,12 +28,12 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3
; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
-; CHECK-NEXT: v_mac_f32_e32 v6, 0x4f800000, v0
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6
+; CHECK-NEXT: v_madmk_f32 v0, v0, 0x4f800000, v6
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v6, v6
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6
+; CHECK-NEXT: v_madmk_f32 v0, v6, 0xcf800000, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
; CHECK-NEXT: v_mul_lo_u32 v8, v1, v6
@@ -205,7 +205,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v1
; CHECK-NEXT: v_trunc_f32_e32 v4, v4
-; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v4
+; CHECK-NEXT: v_madmk_f32 v1, v4, 0xcf800000, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
; CHECK-NEXT: v_mul_lo_u32 v5, s4, v4
@@ -636,12 +636,12 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
; CGP-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
-; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v0
-; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2
+; CGP-NEXT: v_madmk_f32 v0, v0, 0x4f800000, v2
+; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CGP-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
; CGP-NEXT: v_trunc_f32_e32 v2, v2
-; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
+; CGP-NEXT: v_madmk_f32 v0, v2, 0xcf800000, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
; CGP-NEXT: v_mul_lo_u32 v12, v1, v2
@@ -803,12 +803,12 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v7
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6
; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v2
-; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
+; CGP-NEXT: v_madmk_f32 v2, v2, 0x4f800000, v4
+; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2
; CGP-NEXT: v_trunc_f32_e32 v4, v4
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
+; CGP-NEXT: v_madmk_f32 v2, v4, 0xcf800000, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
; CGP-NEXT: v_mul_lo_u32 v10, v3, v4
@@ -1091,12 +1091,12 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v6
; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5
; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v0
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
+; CHECK-NEXT: v_madmk_f32 v0, v0, 0x4f800000, v2
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v2
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
+; CHECK-NEXT: v_madmk_f32 v0, v2, 0xcf800000, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
; CHECK-NEXT: v_mul_lo_u32 v8, v1, v2
@@ -1526,12 +1526,12 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v0
-; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4
+; CGP-NEXT: v_madmk_f32 v0, v0, 0x4f800000, v4
+; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
; CGP-NEXT: v_trunc_f32_e32 v4, v4
-; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4
+; CGP-NEXT: v_madmk_f32 v0, v4, 0xcf800000, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
; CGP-NEXT: v_mul_lo_u32 v13, v1, v4
@@ -1695,12 +1695,12 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9
; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v2
-; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
+; CGP-NEXT: v_madmk_f32 v2, v2, 0x4f800000, v4
+; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2
; CGP-NEXT: v_trunc_f32_e32 v4, v4
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
+; CGP-NEXT: v_madmk_f32 v2, v4, 0xcf800000, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
; CGP-NEXT: v_mul_lo_u32 v8, v3, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 12df4b7c7fc33d7..1d6fc87e7989efd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -28,12 +28,12 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3
; CHECK-NEXT:...
[truncated]
|
Folding a move-immediate into another move can create a new move-immediate. Recognize these and use them as sources for further folding. For the AMDGPU target this happens with sequences like: s_mov_b32 s0, 12345 v_mov_b32 v0, s0 The second instruction will be folded to: v_mov_b32 v0, 12345 With this patch, the immediate value 12345 can then be folded into further uses of v0.
TBH I don't understand why this does not affect other (non AMDGPU) targets. Even for AMDGPU, I compiled a bunch of graphics shaders with -mcpu=gfx1100 and saw no difference from this patch. Perhaps other targets never create sequences where a move-immediate feeds into another move? Perhaps AMDGPU should not create them either? |
; GFX10-SDAG-LABEL: s_mul_fma_32_f32: | ||
; GFX10-SDAG: ; %bb.0: | ||
; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s1 | ||
; GFX10-SDAG-NEXT: v_fmac_f32_e64 v0, 0x42000000, s0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why isn't this one turned into a v_fmamk?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great spot! #72258
For the most part, we shouldn't. I think in general this is the class of problem that RegBankSelect or follow up optimizations should be avoiding before selection. It might make sense to preserve this pattern in rare situations with multiple uses of the constant |
<AMDGPU-specific> OK, makes sense, but then we end up with things like this: v_mov_b32 v1, 0x12345678 On GFX10+, the best thing SIFoldOperands could do with this is to fold one of the constants in as a literal, and the other one as an sgpr: s_mov_b32 s2, 0x23456789 But currently it has no way to "go backwards" from a constant in a vgpr to the same constant in an sgpr. </AMDGPU-specific> |
I think SIFoldOperands works backwards from how it should, going from uses to defs instead of defs to uses. It would be better if it collected seen foldable instructions, more like how PeepholeOpt does it. If we consistently have constants materialized in VGPRs (as should happen in GlobalIsel), I think it's simpler to introduce new SGPRs where appropriate rather than needing to handle existing partially folded operands as it does now. |
I know - you have mentioned it once or twice :) I think https://reviews.llvm.org/D114643 might have removed a (the?) barrier to doing this, since the pass no longer cares how many uses each constant has. As for the implementation, I'm not sure it needs to "collect seen foldable instructions". We're in SSA form so can't it just do:
Then the only complexity is if multiple operands of the same MI are foldable, you might want to carefully choose which ones to fold. |
Folding a move-immediate into another move can create a new
move-immediate. Recognize these and use them as sources for further
folding.
For the AMDGPU target this happens with sequences like:
s_mov_b32 s0, 12345
v_mov_b32 v0, s0
The second instruction will be folded to:
v_mov_b32 v0, 12345
With this patch, the immediate value 12345 can then be folded into
further uses of v0.