-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[LLVM] Insert IMPLICIT_DEF for a register sequence if any operand is undef #158000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-regalloc @llvm/pr-subscribers-llvm-globalisel Author: Abhinav Garg (abhigargrepo) Changes: Currently, live interval analysis is unable to track the undefined sub-parts of a register tuple. Patch is 3.95 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/158000.diff — 142 Files Affected:
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 8d94b40a41bea..7e161bfb39ac1 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1988,6 +1988,7 @@ void TwoAddressInstructionImpl::eliminateRegSequence(
SmallVector<Register, 4> OrigRegs;
VNInfo *DefVN = nullptr;
+ bool DefEmitted = false;
if (LIS) {
OrigRegs.push_back(MI.getOperand(0).getReg());
for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2)
@@ -1998,9 +1999,17 @@ void TwoAddressInstructionImpl::eliminateRegSequence(
.valueOut();
}
}
-
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2)
+ if (MI.getOperand(i).isReg() && MI.getOperand(i).isUndef()) {
+ // Insert the IMPLICIT_DEF on dst register.
+ MachineInstr *DefMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), DstReg);
+ MBBI = DefMI;
+ DefEmitted = true;
+ break;
+ }
LaneBitmask UndefLanes = LaneBitmask::getNone();
- bool DefEmitted = false;
for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) {
MachineOperand &UseMO = MI.getOperand(i);
Register SrcReg = UseMO.getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1cd9c0bfeb7e6..84247841691ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -8,37 +8,40 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 3, v0
+; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v8, s[0:1]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v8, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[6:7]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
-; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v10, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v3, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v5, v7
-; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v1, v2, v[8:9]
+; GFX11-NEXT: global_store_b64 v10, v[4:5], s[2:3]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -60,13 +63,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
-; GFX10-NEXT: global_load_dword v4, v3, s[6:7]
+; GFX10-NEXT: global_load_dword v6, v3, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v6, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v6, v[4:5]
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
@@ -78,18 +82,21 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3]
-; GFX11-NEXT: global_load_b32 v5, v2, s[4:5]
+; GFX11-NEXT: global_load_b32 v8, v2, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v8, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v8, v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v6
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -112,13 +119,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
+; GFX10-NEXT: global_load_dword v6, v2, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v6, v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v6, v1, v[4:5]
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
@@ -130,18 +138,21 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v5, v1, s[2:3]
+; GFX11-NEXT: global_load_b32 v8, v1, s[2:3]
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v0, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v8, v1, v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v6
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -210,14 +221,15 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
+; GFX10-NEXT: global_load_dword v6, v2, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v6, v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v6, v1, v[4:5]
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
@@ -229,18 +241,21 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v8, v0, s[2:3]
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v0, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v8, v1, v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v6
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -390,16 +405,17 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xfff00000, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v0, v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v0, v3, v[6:7]
; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
@@ -413,6 +429,7 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -420,15 +437,15 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xfff00000, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, v5
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v3, v[6:7]
; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[8:9]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1]
@@ -510,7 +527,9 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, v2, v4, 0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s3, v2, v5, v[1:2]
+; GFX10-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s3, v2, v5, v[3:4]
; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: .LBB10_2: ; %Flow
@@ -547,11 +566,14 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
+; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
-; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX11-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[3:4]
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v6
; GFX11-NEXT: .LBB10_2: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2
; GFX11-NEXT: s_cbranch_execz .LBB10_4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 637aaf7529364..b2517431f6fbf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -548,7 +548,9 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN-NEXT: v_mov_b32_e32 v4, v0
; GCN-NEXT: v_mov_b32_e32 v5, v1
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
+; GCN-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GCN-NEXT: v_mov_b32_e32 v6, v1
+; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[6:7]
; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -557,8 +559,10 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[6:7]
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v2, v[3:4]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -567,9 +571,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v2
; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v5, 0
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[7:8]
+; GFX11-NEXT: v_mov_b32_e32 v7, v1
+; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v4, v3, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[9:10]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i64:
@@ -3129,34 +3135,40 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
-; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX7-NEXT: buffer_load_dword v6, v[2:3], s[0:3], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v7, 0x50
+; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, 0
+; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[4:5]
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_vregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dword v4, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX8-NEXT: flat_load_dword v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x50
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v7, 0
+; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v6, v7, v[4:5]
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_vregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dword v4, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX9-NEXT: global_load_dword v6, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x50
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v7, 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v6, v7, v[4:5]
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
@@ -3165,8 +3177,10 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
-; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, 0x50, v4, v[3:4]
+; GFX10-NEXT: ...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: Abhinav Garg (abhigargrepo) Changes: Currently, live interval analysis is unable to track the undefined sub-parts of a register tuple. Patch is 3.95 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/158000.diff — 142 Files Affected:
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 8d94b40a41bea..7e161bfb39ac1 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1988,6 +1988,7 @@ void TwoAddressInstructionImpl::eliminateRegSequence(
SmallVector<Register, 4> OrigRegs;
VNInfo *DefVN = nullptr;
+ bool DefEmitted = false;
if (LIS) {
OrigRegs.push_back(MI.getOperand(0).getReg());
for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2)
@@ -1998,9 +1999,17 @@ void TwoAddressInstructionImpl::eliminateRegSequence(
.valueOut();
}
}
-
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2)
+ if (MI.getOperand(i).isReg() && MI.getOperand(i).isUndef()) {
+ // Insert the IMPLICIT_DEF on dst register.
+ MachineInstr *DefMI =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), DstReg);
+ MBBI = DefMI;
+ DefEmitted = true;
+ break;
+ }
LaneBitmask UndefLanes = LaneBitmask::getNone();
- bool DefEmitted = false;
for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) {
MachineOperand &UseMO = MI.getOperand(i);
Register SrcReg = UseMO.getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1cd9c0bfeb7e6..84247841691ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -8,37 +8,40 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 3, v0
+; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v8, s[0:1]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v8, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[6:7]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
-; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v10, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v3, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v5, v7
-; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v1, v2, v[8:9]
+; GFX11-NEXT: global_store_b64 v10, v[4:5], s[2:3]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -60,13 +63,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
-; GFX10-NEXT: global_load_dword v4, v3, s[6:7]
+; GFX10-NEXT: global_load_dword v6, v3, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v6, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v6, v[4:5]
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
@@ -78,18 +82,21 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3]
-; GFX11-NEXT: global_load_b32 v5, v2, s[4:5]
+; GFX11-NEXT: global_load_b32 v8, v2, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v8, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v8, v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v6
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -112,13 +119,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
+; GFX10-NEXT: global_load_dword v6, v2, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v6, v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v6, v1, v[4:5]
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
@@ -130,18 +138,21 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v5, v1, s[2:3]
+; GFX11-NEXT: global_load_b32 v8, v1, s[2:3]
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v0, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v8, v1, v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v6
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -210,14 +221,15 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
+; GFX10-NEXT: global_load_dword v6, v2, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v6, v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v6, v1, v[4:5]
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
@@ -229,18 +241,21 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v8, v0, s[2:3]
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v0, 0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v8, v1, v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v6
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -390,16 +405,17 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xfff00000, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v0, v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v0, v3, v[6:7]
; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
@@ -413,6 +429,7 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -420,15 +437,15 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xfff00000, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, v5
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v3, v[6:7]
; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[8:9]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1]
@@ -510,7 +527,9 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s3, v2, v4, 0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s3, v2, v5, v[1:2]
+; GFX10-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s3, v2, v5, v[3:4]
; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: .LBB10_2: ; %Flow
@@ -547,11 +566,14 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
+; GFX11-NEXT: ; implicit-def: $vgpr3_vgpr4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
-; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX11-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[3:4]
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v6
; GFX11-NEXT: .LBB10_2: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s2, s2
; GFX11-NEXT: s_cbranch_execz .LBB10_4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 637aaf7529364..b2517431f6fbf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -548,7 +548,9 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN-NEXT: v_mov_b32_e32 v4, v0
; GCN-NEXT: v_mov_b32_e32 v5, v1
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
+; GCN-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GCN-NEXT: v_mov_b32_e32 v6, v1
+; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[6:7]
; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -557,8 +559,10 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[1:2]
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[6:7]
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v2, v[3:4]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -567,9 +571,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v2
; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v5, 0
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[7:8]
+; GFX11-NEXT: v_mov_b32_e32 v7, v1
+; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v4, v3, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[9:10]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i64:
@@ -3129,34 +3135,40 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
-; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX7-NEXT: buffer_load_dword v6, v[2:3], s[0:3], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v7, 0x50
+; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, 0
+; GFX7-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[4:5]
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_vregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dword v4, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX8-NEXT: flat_load_dword v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x50
+; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v7, 0
+; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v6, v7, v[4:5]
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_vregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dword v4, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX9-NEXT: global_load_dword v6, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x50
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v7, 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v6, v7, v[4:5]
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
@@ -3165,8 +3177,10 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
-; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, 0x50, v4, v[3:4]
+; GFX10-NEXT: ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could use a new dedicated MIR test. In particular should make sure undef REG_SEQUENCE inputs with subregister uses are handled properly
} | ||
|
||
for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) | ||
if (MI.getOperand(i).isReg() && MI.getOperand(i).isUndef()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You do not need to guard against non-register operands to REG_SEQUENCE
// Insert the IMPLICIT_DEF on dst register. | ||
MachineInstr *DefMI = | ||
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), | ||
TII->get(TargetOpcode::IMPLICIT_DEF), DstReg); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is emitting a full register def for each input. You only need to insert a subregister def for this particular subregister from the reg_sequence input
i.e., something like
BuildMI(IMPLICIT_DEF)
.addDef(DstReg, 0, MI.getOperand(I + 1).getSubReg())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will it matter in terms of performance?
I mean, this patch will first insert an IMPLICIT_DEF for the whole tuple, and then the next instruction will overwrite the defined sub-parts, leaving only the undefined sub-parts as IMPLICIT_DEF.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it could. But it's also important to treat subregisters consistently.
Needs a better explanation. Is this fixing a bug? Or improving the quality of LiveIntervals in some way? (How?) Why does it affect codegen? Could also do with an example of where a new IMPLICIT_DEF will be inserted. |
Currently, live-interval analysis is unable to track the undefined sub-parts of a register tuple.
This patch inserts an IMPLICIT_DEF for such tuples in the two-address instruction pass so that their live intervals are tracked correctly.