diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 48d4e259bc1ce..c97fef6d894ab 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -2520,16 +2520,22 @@ multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> : VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>; +multiclass VOP2Only_Real_DOT_ACC_gfx10<bits<6> op> : VOP2_Real_dpp_gfx10<op>, + VOP2_Real_dpp8_gfx10<op> { + let IsSingle = 1 in + defm NAME : VOP2_Real_e32_gfx10<op>; +} + let SubtargetPredicate = HasDot5Insts in { defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx9<0x37>; // NB: Opcode conflicts with V_DOT8C_I32_I4 // This opcode exists in gfx 10.1* only - defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx10<0x02>; + defm V_DOT2C_F32_F16 : VOP2Only_Real_DOT_ACC_gfx10<0x02>; } let SubtargetPredicate = HasDot6Insts in { defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx9<0x39>; - defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx10<0x0d>; + defm V_DOT4C_I32_I8 : VOP2Only_Real_DOT_ACC_gfx10<0x0d>; } let SubtargetPredicate = HasDot4Insts in { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll index 6a79ad85a9a28..d2608055eb491 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -13,7 +13,7 @@ define i32 @v_sdot4(i32 %a, i32 %b, i32 %c) { ; GFX10-LABEL: v_sdot4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot4c_i32_i8_e32 v2, v0, v1 +; GFX10-NEXT: v_dot4c_i32_i8 v2, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 false) @@ -78,7 +78,7 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) { ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5 -; GFX10-NEXT: v_dot4c_i32_i8_e32 v8, v0, v1 +; GFX10-NEXT: 
v_dot4c_i32_i8 v8, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %a.cast = bitcast <4 x i8> %a to i32 @@ -99,7 +99,7 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX10-NEXT: v_dot4c_i32_i8_e32 v2, v0, v1 +; GFX10-NEXT: v_dot4c_i32_i8 v2, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg float %a @@ -120,7 +120,7 @@ define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot4c_i32_i8_e32 v2, v0, v1 +; GFX10-NEXT: v_dot4c_i32_i8 v2, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll index 8573cd4d1fe13..b7baad61651f7 100644 --- a/llvm/test/CodeGen/AMDGPU/fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll @@ -54,7 +54,7 @@ entry: ; GFX906: v_mac_f32_e32 ; GFX906-DL-UNSAFE: v_dot2_f32_f16 -; GFX10-DL-UNSAFE: v_dot2c_f32_f16_e32 +; GFX10-DL-UNSAFE: v_dot2c_f32_f16 ; GFX906-CONTRACT: v_dot2_f32_f16 @@ -95,7 +95,7 @@ entry: ; GFX906: v_mac_f32_e32 ; GFX906-DL-UNSAFE: v_dot2_f32_f16 -; GFX10-DL-UNSAFE: v_dot2c_f32_f16_e32 +; GFX10-DL-UNSAFE: v_dot2c_f32_f16 ; GFX906-CONTRACT: v_dot2_f32_f16 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 56f72ac9d9e8c..9da07ea04ded5 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2855,7 +2855,7 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0001 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v2, v1, v0 +; 
GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 ; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index 5c44ba008df04..fdd913867c8f8 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -127,7 +127,7 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v0, v1, v2 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; @@ -336,7 +336,7 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: global_load_sshort v4, v1, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v4, v2, v3 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v2, v3 ; GFX10-DL-NEXT: global_store_short v1, v4, s[2:3] ; GFX10-DL-NEXT: s_endpgm ; @@ -710,7 +710,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v0, v1, v2 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; @@ -906,7 +906,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v0, v1, v2 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; @@ -1335,7 +1335,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 
0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 ; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; @@ -1513,7 +1513,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 ; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; @@ -1698,7 +1698,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 ; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; @@ -1870,7 +1870,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v0, v1, v2 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; @@ -2070,7 +2070,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v1, v3, v0 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v3, v0 ; GFX10-DL-NEXT: global_store_dword v2, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; @@ -2276,7 +2276,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: 
v_mov_b32_e32 v1, s0 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v1, v2, v0 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0 ; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; @@ -2479,7 +2479,7 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s2, s3 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v0, v1, v2 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; @@ -2674,7 +2674,7 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 ; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; @@ -2874,7 +2874,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v1, v2, v0 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0 ; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ; @@ -3105,7 +3105,7 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-DL-NEXT: v_dot4c_i32_i8_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 ; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] ; GFX10-DL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index 3ced3765b9143..fdf1b7db42652 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -26,7 
+26,7 @@ entry: ; GCN-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp ; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX940: v_dot2c_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX10: {{v_dot2c_f32_f16_e32|v_dot2acc_f32_f16}} v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX10: {{v_dot2c_f32_f16|v_dot2acc_f32_f16}} v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp( ptr addrspace(1) %r, ptr addrspace(1) %a, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll index 08dbe29c5de4e..7770fc02d5070 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll @@ -29,7 +29,7 @@ entry: ; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp ; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX10: v_dot4c_i32_i8_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX10: v_dot4c_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GF11: v_dot4_i32_iu8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} neg_lo:[1,1,0]{{$}} define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp( ptr addrspace(1) %r, diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1011-xdl-insts.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1011-xdl-insts.txt index 914b6a7db7dde..7397316bbf92b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1011-xdl-insts.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1011-xdl-insts.txt @@ -1,10 +1,10 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1011 -disassemble -show-encoding < %s | FileCheck %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1012 -disassemble -show-encoding < %s | FileCheck %s -# CHECK: v_dot2c_f32_f16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x04] +# CHECK: v_dot2c_f32_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x04] 0x01,0x05,0x0a,0x04 -# CHECK: v_dot2c_f32_f16_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x05] +# CHECK: v_dot2c_f32_f16 
v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x05] 0x01,0x05,0xfe,0x05 # CHECK: v_dot2c_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x00] @@ -85,10 +85,10 @@ # CHECK: v_dot2c_f32_f16_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x80,0x00] 0xfa,0x04,0x0a,0x04,0x01,0xe4,0x80,0x00 -# CHECK: v_dot4c_i32_i8_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x1a] +# CHECK: v_dot4c_i32_i8 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x1a] 0x01,0x05,0x0a,0x1a -# CHECK: v_dot4c_i32_i8_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x1b] +# CHECK: v_dot4c_i32_i8 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x1b] 0x01,0x05,0xfe,0x1b # CHECK: v_dot4c_i32_i8_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1011_dlops.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1011_dlops.txt index 972673542f4ce..4689a40e936e4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1011_dlops.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1011_dlops.txt @@ -29,7 +29,7 @@ # GFX10: v_dot8_u32_u4 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c] 0x00,0x40,0x19,0xcc,0x01,0x05,0x0e,0x1c -# GFX10: v_dot2c_f32_f16_e32 v5, v1, v2 +# GFX10: v_dot2c_f32_f16 v5, v1, v2 0x01,0x05,0x0a,0x04 # GFX10: v_dot2c_f32_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 @@ -44,7 +44,7 @@ # GFX10: v_dot2c_f32_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 0xea,0x04,0x0a,0x04,0x01,0x77,0x39,0x05 -# GFX10: v_dot4c_i32_i8_e32 v5, v1, v2 +# GFX10: v_dot4c_i32_i8 v5, v1, v2 0x01,0x05,0x0a,0x1a # GFX10: v_dot4c_i32_i8_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0