Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9115,6 +9115,63 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
MachineOperand &Src1 = Inst.getOperand(2);
const DebugLoc &DL = Inst.getDebugLoc();

if (ST.useRealTrue16Insts()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably should get a new mir test

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah we already have a test for it here fix-sgpr-copies-f16-true16.mir

Register SrcReg0 = Src0.getReg();
Register SrcReg1 = Src1.getReg();

if (!RI.isVGPR(MRI, SrcReg0)) {
SrcReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg0).add(Src0);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should use copy, may need a separate path if there are non-register inputs

}
if (!RI.isVGPR(MRI, SrcReg1)) {
SrcReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), SrcReg1).add(Src1);
}
bool isSrc0Reg16 = MRI.constrainRegClass(SrcReg0, &AMDGPU::VGPR_16RegClass);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

VGPR_16 is not a subclass of VGPR_32, how could you constrain it?
Also it is an obscure cast from TargetRegisterClass * to bool.

Copy link
Contributor Author

@broxigarchen broxigarchen Oct 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not trying to constrain but just checking if it's vgpr_16 or not. Maybe I can check the regclass directly

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's cleaner, or check getRegBitWidth().

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid using reg bitwidth, it usually works out cleanest to try the various register class matching functions

bool isSrc1Reg16 = MRI.constrainRegClass(SrcReg1, &AMDGPU::VGPR_16RegClass);

auto NewMI = BuildMI(*MBB, Inst, DL, get(AMDGPU::REG_SEQUENCE), ResultReg);
switch (Inst.getOpcode()) {
case AMDGPU::S_PACK_LL_B32_B16: {
NewMI
.addReg(SrcReg0, 0,
isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
.addImm(AMDGPU::lo16)
.addReg(SrcReg1, 0,
isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
.addImm(AMDGPU::hi16);
} break;
case AMDGPU::S_PACK_LH_B32_B16: {
NewMI
.addReg(SrcReg0, 0,
isSrc0Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
.addImm(AMDGPU::lo16)
.addReg(SrcReg1, 0, AMDGPU::hi16)
.addImm(AMDGPU::hi16);
} break;
case AMDGPU::S_PACK_HL_B32_B16: {
NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
.addImm(AMDGPU::lo16)
.addReg(SrcReg1, 0,
isSrc1Reg16 ? AMDGPU::NoSubRegister : AMDGPU::lo16)
.addImm(AMDGPU::hi16);
} break;
case AMDGPU::S_PACK_HH_B32_B16: {
NewMI.addReg(SrcReg0, 0, AMDGPU::hi16)
.addImm(AMDGPU::lo16)
.addReg(SrcReg1, 0, AMDGPU::hi16)
.addImm(AMDGPU::hi16);
} break;
default:
llvm_unreachable("unhandled s_pack_* instruction");
}

MachineOperand &Dest = Inst.getOperand(0);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
return;
}

switch (Inst.getOpcode()) {
case AMDGPU::S_PACK_LL_B32_B16: {
Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/add.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -790,11 +790,9 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
Expand Down
22,342 changes: 14,415 additions & 7,927 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Large diffs are not rendered by default.

2,356 changes: 1,536 additions & 820 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll

Large diffs are not rendered by default.

5,894 changes: 3,865 additions & 2,029 deletions llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll

Large diffs are not rendered by default.

Loading