-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[AMDGPU][True16][CodeGen] si-fix-sgpr-copies legalize size mismatched V2S copy with subreg case #161290
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter. |
e90cf7f
to
f50f348
Compare
cd3fc90
to
b21f82c
Compare
b21f82c
to
854a465
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesIn true16 mode, isel puts 16-bit values in both vgpr16 and sreg32. Thus si-fix-sgpr-copies needs to legalize register-size-mismatched instructions in the moveToVALU lowering process. For special instructions, previously we legalized:
With patch #144819, case 2 is removed. This patch removes case 2 support and adds the missing subreg cases:
This is a follow up of #160891 Patch is 152.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161290.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 044ea866342c2..10283a2206732 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7527,6 +7527,11 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
return;
unsigned Opcode = MI.getOpcode();
+ if (Opcode == AMDGPU::REG_SEQUENCE) {
+ legalizeSpecialInst_t16(MI, MRI);
+ return;
+ }
+
MachineBasicBlock *MBB = MI.getParent();
// Legalize operands and check for size mismatch
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
@@ -7565,6 +7570,65 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
legalizeOperandsVALUt16(MI, OpIdx, MRI);
}
+// Legalize operands of size-mismatches special inst between 16bit and 32bit
+// in moveToVALU lowering in true16 mode. This caused by 16bit
+// placed in both vgpr16 and sreg32 by isel. Including cases:
+// Copy
+// 1. dst32 = copy vgpr16 => dst32 = REG_SEQUENCE(vgpr16, lo16)
+// 2. dst32 = copy .lo16:vgpr32 / dst32 = copy .hi16:vgpr32
+// => dst32 = REG_SEQUENCE(.lo16/hi16:vgpr32, lo16)
+// 3. sgpr16 = copy vgpr32/... (skipped, isel do not generate sgpr16)
+//
+// Reg_sequence
+// dst32 = reg_sequence(vgpr32, lo16/hi16)
+// => dst32 = reg_sequence(.lo16:vgpr32, lo16/hi16)
+//
+// This can be removed after we have sgpr16 in place.
+void SIInstrInfo::legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const {
+ unsigned Opcode = Inst.getOpcode();
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+ switch (Opcode) {
+ case AMDGPU::COPY: {
+ Register SrcReg = Inst.getOperand(1).getReg();
+ if (!SrcReg.isVirtual() || !RI.isVGPR(MRI, SrcReg))
+ return;
+
+ bool SetSubReg = false;
+ Register SrcSubReg = Inst.getOperand(1).getSubReg();
+ const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
+ if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
+ } else if (NewDstRC == &AMDGPU::VGPR_32RegClass &&
+ (SrcSubReg == AMDGPU::hi16 || SrcSubReg == AMDGPU::lo16)) {
+ SetSubReg = true;
+ } else
+ return;
+
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+ get(AMDGPU::IMPLICIT_DEF), Undef);
+ Inst.setDesc(get(AMDGPU::REG_SEQUENCE));
+ if (SetSubReg)
+ Inst.getOperand(1).setSubReg(SrcSubReg);
+
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16));
+ Inst.addOperand(MachineOperand::CreateReg(Undef, 0));
+ Inst.addOperand(MachineOperand::CreateImm(AMDGPU::hi16));
+ } break;
+ case AMDGPU::REG_SEQUENCE: {
+ for (unsigned I = 0, E = (Inst.getNumOperands() - 1) / 2; I < E; ++I) {
+ Register SrcReg = Inst.getOperand(1 + 2 * I).getReg();
+ auto SubReg = Inst.getOperand(1 + 2 * I + 1).getImm();
+ if (SrcReg.isVirtual() && RI.isVGPR(MRI, SrcReg) &&
+ MRI.constrainRegClass(SrcReg, &AMDGPU::VGPR_32RegClass) &&
+ (SubReg == AMDGPU::lo16 || SubReg == AMDGPU::hi16)) {
+ Inst.getOperand(1 + 2 * I).setSubReg(AMDGPU::lo16);
+ }
+ }
+ } break;
+ }
+}
+
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {
@@ -8083,6 +8147,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
+ if (ST.useRealTrue16Insts())
+ legalizeSpecialInst_t16(Inst, MRI);
+
if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
// Instead of creating a copy where src and dst are the same register
@@ -8105,38 +8172,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
return;
}
- // If this is a v2s copy between 16bit and 32bit reg,
- // replace vgpr copy to reg_sequence/extract_subreg
- // This can be remove after we have sgpr16 in place
- if (ST.useRealTrue16Insts() && Inst.isCopy() &&
- Inst.getOperand(1).getReg().isVirtual() &&
- RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
- const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
- if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
- get(AMDGPU::REG_SEQUENCE), NewDstReg)
- .addReg(Inst.getOperand(1).getReg())
- .addImm(AMDGPU::lo16)
- .addReg(Undef)
- .addImm(AMDGPU::hi16);
- Inst.eraseFromParent();
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
- AMDGPU::lo16)) {
- Inst.getOperand(1).setSubReg(AMDGPU::lo16);
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- return;
- }
- }
-
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
legalizeOperands(Inst, MDT);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 31a2d55e1baad..9728ed4b6e002 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1375,6 +1375,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
MachineRegisterInfo &MRI) const;
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
MachineRegisterInfo &MRI) const;
+ void legalizeSpecialInst_t16(MachineInstr &Inst,
+ MachineRegisterInfo &MRI) const;
/// Replace the instructions opcode with the equivalent VALU
/// opcode. This function will also move the users of MachineInstruntions
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index 47cb6bd3b3bb6..03672fb760586 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -4913,12 +4913,10 @@ define inreg <20 x i32> @bitcast_v40i16_to_v20i32_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -8342,12 +8340,10 @@ define inreg <20 x i32> @bitcast_v40f16_to_v20i32_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20i32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -12629,12 +12625,10 @@ define inreg <20 x float> @bitcast_v40i16_to_v20f32_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -16043,12 +16037,10 @@ define inreg <20 x float> @bitcast_v40f16_to_v20f32_scalar(<40 x half> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v20f32_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -19655,12 +19647,10 @@ define inreg <10 x i64> @bitcast_v40i16_to_v10i64_scalar(<40 x i16> inreg %a, i3
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -23094,12 +23084,10 @@ define inreg <10 x i64> @bitcast_v40f16_to_v10i64_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10i64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -25911,12 +25899,10 @@ define inreg <10 x double> @bitcast_v40i16_to_v10f64_scalar(<40 x i16> inreg %a,
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -29258,12 +29244,10 @@ define inreg <10 x double> @bitcast_v40f16_to_v10f64_scalar(<40 x half> inreg %a
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v10f64_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v32.h
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v1
; GFX11-TRUE16-NEXT: s_lshr_b32 s41, s29, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s28, 16
@@ -31057,12 +31041,10 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40i16_to_v40f16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -31074,12 +31056,12 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB57_3
@@ -31103,11 +31085,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, s29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, s28, 3 op_sel_hi:[1,0]
@@ -31123,7 +31105,7 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, s12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, s11, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, s10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, s9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, s0, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, s1, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, s2, 3 op_sel_hi:[1,0]
@@ -31168,9 +31150,9 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB57_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
@@ -32879,12 +32861,10 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-LABEL: bitcast_v40f16_to_v40i16_scalar:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, 0
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v1.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s29, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v19.h
; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s28, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s27, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s26, 16
@@ -32896,12 +32876,12 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_lshr_b32 s12, s20, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s11, s19, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s10, s18, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s17, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s17, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s16, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s2, 16
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s0, 16
; GFX11-TRUE16-NEXT: s_mov_b32 s46, 0
; GFX11-TRUE16-NEXT: s_and_b32 s47, vcc_lo, exec_lo
; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_3
@@ -32925,11 +32905,11 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s12, s20, s12
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s11, s19, s11
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s10, s18, s10
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s17, s8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s9, s17, s9
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s16, s7
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s9
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, s29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, s28 op_sel_hi:[0,1]
@@ -32945,7 +32925,7 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, s12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, s11 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, s10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, s9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, s0 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, s1 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, s2 op_sel_hi:[0,1]
@@ -32990,9 +32970,9 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, s15 :: v_dual_mov_b32 v29, s14
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, s13 :: v_dual_mov_b32 v31, s12
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, s11 :: v_dual_mov_b32 v33, s10
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s8 :: v_dual_mov_b32 v35, s7
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v35, s7
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, s6 :: v_dual_mov_b32 v37, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s9
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, s4 :: v_dual_mov_b32 v39, s8
; GFX11-TRUE16-NEXT: .LBB59_5: ; %end
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 2cc7c448b2e11..afb2e96a0079f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -5328,15 +5328,11 @@ define inreg <22 x i32> @bitcast_v44i16_to_v22i32_scalar(<44 x i16> inreg %a, ...
[truncated]
|
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]].lo16, %subreg.lo16, [[DEF1]], %subreg.hi16 | ||
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec | ||
%0:vgpr_32 = IMPLICIT_DEF | ||
%1:sreg_32 = COPY %0.lo16:vgpr_32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be illegal MIR to start with, I thought the plan was to fix emitting all the mismatched size copies?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've cleaned up many isel patterns to make sure we don't have a vgpr32/vgpr16 mix-up, so that all vgpr sizes are selected correctly with respect to the data type.
However, we are still emitting some illegal MIR from isel. This happens because isel sometimes puts a 16-bit value into vgpr16 and sometimes into sreg32. We use uniform/divergent to try to select different trunc/ext/build_vector
patterns for sreg32 vs vgpr16, but they do not always match — e.g. the return value of a true16 load is in vgpr16 but is uniform, so isel could still select the pattern for sreg32 and thus generate this bad MIR.
The current plan is to eliminate these illegal cases in fix-sgpr-copies pass
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We probably should not have i16 as a legal type for true16. Can you try using ValueTypeByHwMode to remove i16 for the true16 case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's new to me. I'll try to look into this. Thanks Matt!
if (SetSubReg) | ||
Inst.getOperand(1).setSubReg(SrcSubReg); | ||
|
||
Inst.addOperand(MachineOperand::CreateImm(AMDGPU::lo16)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use MachineInstrBuilder?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't want to build a new instruction here since this function should only legalize the operands. The caller still expects the instruction reference to be valid.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can still use MachineInstrBuilder to append operands to an existing instruction
if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) { | ||
} else if (NewDstRC == &AMDGPU::VGPR_32RegClass && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Empty block and probably should do something with the result?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This empty block exists only so that the final else branch can handle the unmatched case with a return.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shuffle the condition around to avoid the empty block
In true16 mode, isel puts 16-bit values in both vgpr16 and sreg32. Thus si-fix-sgpr-copies needs to legalize register-size-mismatched instructions in the moveToVALU lowering process.
For special instructions, previously we legalized:
With patch #144819, case 2 is removed.
This patch removes case 2 support and adds the missing subreg cases:
This is a follow up of #160891