-
Notifications
You must be signed in to change notification settings - Fork 11.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][MC] Support v_swap_b16. #100442
Conversation
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write If you have received no comments on your PR for a week, you can request a review If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums. |
b5b892a
to
7b60ebd
Compare
@llvm/pr-subscribers-mc Author: Brox Chen (broxigarchen) ChangesAdded v_swap_b16 support in AMDGPU codeGen Full diff: https://github.com/llvm/llvm-project/pull/100442.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 1fe8beafd5e5d..9b506eb0a711a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -321,6 +321,12 @@ def VOPDstOperand_t16Lo128 : VOPDstOperand <VGPR_16_Lo128> {
let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass";
}
+// Source-encoded destination operand for instructions like v_swap_b16.
+def VOPSrcEncodedDstOperand_t16Lo128 : VOPDstOperand <VGPR_16_Lo128> {
+ let EncoderMethod = VSrcT_b16_Lo128.EncoderMethod;
+ let DecoderMethod = VSrcT_b16_Lo128.DecoderMethod;
+}
+
class VINTRPe <bits<2> op> : Enc32 {
bits<8> vdst;
bits<8> vsrc;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2c0d61ee4afa1..c08903d03939c 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -729,7 +729,22 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
let isAsCheapAsAMove = 1;
}
+def VOP_SWAP_I16 : VOPProfile_True16<VOP_I16_I16> {
+ let Outs32 = (outs VOPDstOperand_t16Lo128:$vdst,
+ VOPSrcEncodedDstOperand_t16Lo128:$vdst1);
+ let Ins32 = (ins VOPSrcEncodedDstOperand_t16Lo128:$src0,
+ VOPDstOperand_t16Lo128:$src1);
+ let Asm32 = " $vdst, $src0";
+}
+
let SubtargetPredicate = isGFX11Plus in {
+ def V_SWAP_B16 : VOP1_Pseudo<"v_swap_b16", VOP_SWAP_I16, [], /* VOP1Only= */ 1> {
+ let Constraints = "$vdst = $src1, $vdst1 = $src0";
+ let DisableEncoding = "$vdst1, $src1";
+ let SchedRW = [Write64Bit, Write64Bit];
+ }
+ // TODO-GFX11 select new insts
+ defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
[], /*VOP1Only=*/ 1> {
@@ -952,7 +967,8 @@ defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03a,
"V_FFBL_B32", "v_ctz_i32_b32">;
defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b,
"V_FFBH_I32", "v_cls_i32">;
-defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>;
+defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>;
+defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>;
defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">;
defm V_NOT_B16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported.s b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
index 46b4e6ffb4037..1374417ac354b 100644
--- a/llvm/test/MC/AMDGPU/gfx10_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
@@ -3287,6 +3287,9 @@ v_subrev_u32_e64 v255, s[12:13], v1, v2
v_subrev_u32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_swap_b16 v0.l, v0.l
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
v_wmma_bf16_16x16x16_bf16 v[16:19], 1.0, v[8:15], v[16:19]
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index d95ef6f15e48d..90d5ca7f72751 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -3448,6 +3448,15 @@ v_sqrt_f64 v[5:6], src_scc
v_sqrt_f64 v[254:255], 0xaf123456
// GFX11: encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+v_swap_b16 v5.l, v1.h
+// GFX11: encoding: [0x81,0xcd,0x0a,0x7e]
+
+v_swap_b16 v5.h, v1.l
+// GFX11: encoding: [0x01,0xcd,0x0a,0x7f]
+
+v_swap_b16 v127.l, v127.l
+// GFX11: encoding: [0x7f,0xcd,0xfe,0x7e]
+
v_swap_b32 v5, v1
// GFX11: encoding: [0x01,0xcb,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
index 5b5381b752feb..ab587a524fc6e 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
@@ -211,6 +211,18 @@ v_sqrt_f16_e32 v255.l, v1.l
v_sqrt_f16_e32 v5.l, v199.l
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_swap_b16_e32 v128.l, v0.l
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, s0
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, 0
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_trunc_f16_e32 v128, 0xfe0b
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
diff --git a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
index f6d2a19326e1d..c15b65728b247 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
@@ -19,6 +19,15 @@
# W64: [[@LINE+1]]:1: warning: invalid instruction encoding
0xff,0x04,0x02,0xc9,0x03,0x03,0x06,0x05,0x56,0x34,0x12,0xaf
+# GFX11: v_swap_b16 v5.h, s1/*Invalid register, operand has 'VGPR_16_Lo128' register class*/ ; encoding: [0x01,0xcc,0x0a,0x7f]
+0x01,0xcc,0x0a,0x7f
+
+# GFX11: v_swap_b16 v5.h, 0x3c00/*Invalid immediate*/ ; encoding: [0x00,0xcc,0x0a,0x7f]
+0xf2,0xcc,0x0a,0x7f
+
+# GFX11: v_swap_b16 v5.h, 0x78563412/*Invalid immediate*/ ; encoding: [0x12,0xcc,0x0a,0x7f]
+0xff,0xcc,0x0a,0x7f,0x12,0x34,0x56,0x78
+
# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
index b176a57d70f86..778f7deb4ec1a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
@@ -3344,6 +3344,12 @@
# GFX11: v_sqrt_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf
+# GFX11: v_swap_b16 v5.l, v1.h ; encoding: [0x81,0xcd,0x0a,0x7e]
+0x81,0xcd,0x0a,0x7e
+
+# GFX11: v_swap_b16 v5.h, v1.l ; encoding: [0x01,0xcd,0x0a,0x7f]
+0x01,0xcd,0x0a,0x7f
+
# GFX11: v_swap_b32 v5, v1 ; encoding: [0x01,0xcb,0x0a,0x7e]
0x01,0xcb,0x0a,0x7e
|
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) ChangesAdded v_swap_b16 support in AMDGPU codeGen Full diff: https://github.com/llvm/llvm-project/pull/100442.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 1fe8beafd5e5d..9b506eb0a711a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -321,6 +321,12 @@ def VOPDstOperand_t16Lo128 : VOPDstOperand <VGPR_16_Lo128> {
let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass";
}
+// Source-encoded destination operand for instructions like v_swap_b16.
+def VOPSrcEncodedDstOperand_t16Lo128 : VOPDstOperand <VGPR_16_Lo128> {
+ let EncoderMethod = VSrcT_b16_Lo128.EncoderMethod;
+ let DecoderMethod = VSrcT_b16_Lo128.DecoderMethod;
+}
+
class VINTRPe <bits<2> op> : Enc32 {
bits<8> vdst;
bits<8> vsrc;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 2c0d61ee4afa1..c08903d03939c 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -729,7 +729,22 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
let isAsCheapAsAMove = 1;
}
+def VOP_SWAP_I16 : VOPProfile_True16<VOP_I16_I16> {
+ let Outs32 = (outs VOPDstOperand_t16Lo128:$vdst,
+ VOPSrcEncodedDstOperand_t16Lo128:$vdst1);
+ let Ins32 = (ins VOPSrcEncodedDstOperand_t16Lo128:$src0,
+ VOPDstOperand_t16Lo128:$src1);
+ let Asm32 = " $vdst, $src0";
+}
+
let SubtargetPredicate = isGFX11Plus in {
+ def V_SWAP_B16 : VOP1_Pseudo<"v_swap_b16", VOP_SWAP_I16, [], /* VOP1Only= */ 1> {
+ let Constraints = "$vdst = $src1, $vdst1 = $src0";
+ let DisableEncoding = "$vdst1, $src1";
+ let SchedRW = [Write64Bit, Write64Bit];
+ }
+ // TODO-GFX11 select new insts
+ defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
[], /*VOP1Only=*/ 1> {
@@ -952,7 +967,8 @@ defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03a,
"V_FFBL_B32", "v_ctz_i32_b32">;
defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b,
"V_FFBH_I32", "v_cls_i32">;
-defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>;
+defm V_SWAP_B16 : VOP1Only_Real_gfx11_gfx12<0x066>;
+defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>;
defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">;
defm V_NOT_B16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
defm V_CVT_I32_I16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
diff --git a/llvm/test/MC/AMDGPU/gfx10_unsupported.s b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
index 46b4e6ffb4037..1374417ac354b 100644
--- a/llvm/test/MC/AMDGPU/gfx10_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx10_unsupported.s
@@ -3287,6 +3287,9 @@ v_subrev_u32_e64 v255, s[12:13], v1, v2
v_subrev_u32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+v_swap_b16 v0.l, v0.l
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
v_wmma_bf16_16x16x16_bf16 v[16:19], 1.0, v[8:15], v[16:19]
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index d95ef6f15e48d..90d5ca7f72751 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -3448,6 +3448,15 @@ v_sqrt_f64 v[5:6], src_scc
v_sqrt_f64 v[254:255], 0xaf123456
// GFX11: encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+v_swap_b16 v5.l, v1.h
+// GFX11: encoding: [0x81,0xcd,0x0a,0x7e]
+
+v_swap_b16 v5.h, v1.l
+// GFX11: encoding: [0x01,0xcd,0x0a,0x7f]
+
+v_swap_b16 v127.l, v127.l
+// GFX11: encoding: [0x7f,0xcd,0xfe,0x7e]
+
v_swap_b32 v5, v1
// GFX11: encoding: [0x01,0xcb,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
index 5b5381b752feb..ab587a524fc6e 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
@@ -211,6 +211,18 @@ v_sqrt_f16_e32 v255.l, v1.l
v_sqrt_f16_e32 v5.l, v199.l
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_swap_b16_e32 v128.l, v0.l
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, s0
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, 0
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swap_b16_e32 v0.l, 0xfe0b
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_trunc_f16_e32 v128, 0xfe0b
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
diff --git a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
index f6d2a19326e1d..c15b65728b247 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt
@@ -19,6 +19,15 @@
# W64: [[@LINE+1]]:1: warning: invalid instruction encoding
0xff,0x04,0x02,0xc9,0x03,0x03,0x06,0x05,0x56,0x34,0x12,0xaf
+# GFX11: v_swap_b16 v5.h, s1/*Invalid register, operand has 'VGPR_16_Lo128' register class*/ ; encoding: [0x01,0xcc,0x0a,0x7f]
+0x01,0xcc,0x0a,0x7f
+
+# GFX11: v_swap_b16 v5.h, 0x3c00/*Invalid immediate*/ ; encoding: [0x00,0xcc,0x0a,0x7f]
+0xf2,0xcc,0x0a,0x7f
+
+# GFX11: v_swap_b16 v5.h, 0x78563412/*Invalid immediate*/ ; encoding: [0x12,0xcc,0x0a,0x7f]
+0xff,0xcc,0x0a,0x7f,0x12,0x34,0x56,0x78
+
# W32: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
# W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c]
0x10,0x40,0x40,0xcc,0x00,0x11,0x42,0x1c
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
index b176a57d70f86..778f7deb4ec1a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1.txt
@@ -3344,6 +3344,12 @@
# GFX11: v_sqrt_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf
+# GFX11: v_swap_b16 v5.l, v1.h ; encoding: [0x81,0xcd,0x0a,0x7e]
+0x81,0xcd,0x0a,0x7e
+
+# GFX11: v_swap_b16 v5.h, v1.l ; encoding: [0x01,0xcd,0x0a,0x7f]
+0x01,0xcd,0x0a,0x7f
+
# GFX11: v_swap_b32 v5, v1 ; encoding: [0x01,0xcb,0x0a,0x7e]
0x01,0xcb,0x0a,0x7e
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Description is slightly misleading, this does not emit it from codegen and only handles the encoding/decoding
7b60ebd
to
6e732d4
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs gfx12 assembler and disassembler tests.
Should resolve
#61622
Can you please split this into one patch which just adds MC (assembler/disassembler) support and tests, and then a second patch which adds CodeGen support and tests? |
let SubtargetPredicate = isGFX11Plus in { | ||
def V_SWAP_B16 : VOP1_Pseudo<"v_swap_b16", VOP_SWAP_I16, [], /* VOP1Only= */ 1> { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure this opcode is VOP1 only? When I implemented support for v_swap_b16
in our compiler, I tested it with VOP3 encoding and it worked fine. VOP3 v_swap_b16 is nessecary to address v128-v255.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's VOP1 only in GFX11 and GFX12, and it does support v0-v255 for both operands. UPDATE: As a VOP1 instruction it only supports registers up to v127.l/.h, of course, which is very unfortunate.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That can't be true, VOP1 v_swap_b16 definitely operates on v0-v127.lo/.hi. There is no way to encode v128-v255 without VOP3, just like for other true16 instructions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, the lo/hi selector goes to the MSB of the operand fields, so can't encode registers beyond v127.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From what I see, officially there's no VOP3 form, but I could ask ISA people to be sure. Which targets did you try this on, exactly?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did all my gfx11 testing on navi31.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, I did inquire on this and will update this thread as soon as I have a response. Thanks for reporting.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The response from our HW guys reads that the VOP3 codes are not guaranteed to work correctly under all possible circumstances. Meaning, they should not be considered supported regardless of observed side effects.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for investigating, even if the answer is a bit disappointing.
eb09da0
to
d80b7b0
Compare
Thanks for the comment! Just trim the patch to contain only MC changes |
Can you please commit a separate patch first that contains the changes to llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s and llvm/test/MC/AMDGPU/gfx12_asm_vop1.s minus v_swap_b16? The files look fine, just adding that bulk test update should be separate from the functional change to v_swap_b16. |
Created a seperate PR here #100849 |
9b02a91
to
ca2fc62
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. If the VOP3 form is supported, we can add a follow up patch.
Thanks! |
Hi I think this PR is ready. Can anyone help to merge this PR? Thanks! |
@broxigarchen Congratulations on having your first Pull Request (PR) merged into the LLVM Project! Your changes will be combined with recent changes from other authors, then tested Please check whether problems have been caused by your change specifically, as How to do this, and the rest of the post-merge process, is covered in detail here. If your change does cause a problem, it may be reverted, or you can revert it yourself. If you don't get any reports, no action is required from you. Your changes are working as expected, well done! |
v_swap_b16 is not offically supported as VOP3, so it can't be used with v128-255. Tests show that VOP3 appears to work correctly, but according to AMD that should not be relied on. llvm/llvm-project#100442 (comment) Foz-DB Navi31: Totals from 6 (0.01% of 79395) affected shaders: Instrs: 64799 -> 65932 (+1.75%) CodeSize: 360180 -> 368440 (+2.29%) Latency: 1364648 -> 1365922 (+0.09%) InvThroughput: 635843 -> 636475 (+0.10%) Copies: 14766 -> 15698 (+6.31%) VALU: 38743 -> 39675 (+2.41%) Fixes: 80b8bbf ("aco/gfx11: use v_swap_b16") Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30515>
v_swap_b16 is not offically supported as VOP3, so it can't be used with v128-255. Tests show that VOP3 appears to work correctly, but according to AMD that should not be relied on. llvm/llvm-project#100442 (comment) Foz-DB Navi31: Totals from 6 (0.01% of 79395) affected shaders: Instrs: 64799 -> 65932 (+1.75%) CodeSize: 360180 -> 368440 (+2.29%) Latency: 1364648 -> 1365922 (+0.09%) InvThroughput: 635843 -> 636475 (+0.10%) Copies: 14766 -> 15698 (+6.31%) VALU: 38743 -> 39675 (+2.41%) Fixes: 80b8bbf ("aco/gfx11: use v_swap_b16") Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30515> (cherry picked from commit e0818cb)
support V_SWAP_B16 true16 encoding in asm/disasm for GFX11/12 Co-authored-by: guochen2 <guochen2@amd.com>
support V_SWAP_B16 true16 encoding in asm/disasm for GFX11/12