Skip to content

Commit

Permalink
[AMDGPU] Add more llc tests for 48-bit mul generation.
Browse files Browse the repository at this point in the history
Differential Revision: https://reviews.llvm.org/D112554
  • Loading branch information
Abinav Puthan Purayil committed Oct 28, 2021
1 parent 513914e commit fa59218
Show file tree
Hide file tree
Showing 2 changed files with 192 additions and 6 deletions.
132 changes: 126 additions & 6 deletions llvm/test/CodeGen/AMDGPU/mul_int24.ll
Expand Up @@ -177,6 +177,126 @@ entry:
ret void
}

; Signed multiply of two i64 operands that are each sign-extended from their
; low 24 bits (shl by 40 followed by ashr by 40) before the i64 mul.
; The SI/VI/GFX9 checks expect the product to come from a v_mul_i32_i24
; (written to v0) and v_mul_hi_i32_i24 (written to v1) pair. The checks also
; still contain the v_lshlrev_b32 / 64-bit arithmetic-shift-right sequence
; ahead of the multiply pair. The EG and CM checks expect only CF_END and PAD.
define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
; SI-LABEL: test_smul48_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_ashr_i64 v[3:4], v[0:1], 40
; SI-NEXT: v_ashr_i64 v[1:2], v[1:2], 40
; SI-NEXT: v_mul_i32_i24_e32 v0, v3, v1
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v3, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i64 v[3:4], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; VI-NEXT: v_mul_i32_i24_e32 v0, v3, v1
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v3, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_ashrrev_i64 v[3:4], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; GFX9-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; GFX9-NEXT: v_mul_i32_i24_e32 v0, v3, v1
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_i64:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
;
; CM-LABEL: test_smul48_i64:
; CM: ; %bb.0:
; CM-NEXT: CF_END
; CM-NEXT: PAD
; shl 40 + ashr 40 keeps the low 24 bits of each i64 operand, sign-extended.
%shl.lhs = shl i64 %lhs, 40
%lhs24 = ashr i64 %shl.lhs, 40
%shl.rhs = shl i64 %rhs, 40
%rhs24 = ashr i64 %shl.rhs, 40
; Full 64-bit multiply of the two sign-extended 24-bit values.
%mul = mul i64 %lhs24, %rhs24
ret i64 %mul
}

; <2 x i64> variant of the signed test: every lane of both operands is
; sign-extended from its low 24 bits (shl by 40 followed by ashr by 40) and
; the vectors are then multiplied.
; The SI/VI/GFX9 checks expect two v_mul_i32_i24 / v_mul_hi_i32_i24 pairs
; writing v0..v3, preceded by the per-lane shift sequence. The EG and CM
; checks expect only CF_END and PAD.
define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; SI-LABEL: test_smul48_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v6
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_ashr_i64 v[5:6], v[0:1], 40
; SI-NEXT: v_ashr_i64 v[1:2], v[1:2], 40
; SI-NEXT: v_ashr_i64 v[6:7], v[2:3], 40
; SI-NEXT: v_ashr_i64 v[2:3], v[3:4], 40
; SI-NEXT: v_mul_i32_i24_e32 v0, v1, v2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v2
; SI-NEXT: v_mul_i32_i24_e32 v2, v5, v6
; SI-NEXT: v_mul_hi_i32_i24_e32 v3, v5, v6
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
; VI-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
; VI-NEXT: v_mul_i32_i24_e32 v0, v1, v3
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
; VI-NEXT: v_mul_i32_i24_e32 v2, v7, v4
; VI-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; GFX9-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; GFX9-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
; GFX9-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
; GFX9-NEXT: v_mul_i32_i24_e32 v0, v1, v3
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
; GFX9-NEXT: v_mul_i32_i24_e32 v2, v7, v4
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_v2i64:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
;
; CM-LABEL: test_smul48_v2i64:
; CM: ; %bb.0:
; CM-NEXT: CF_END
; CM-NEXT: PAD
; Per-lane shl 40 + ashr 40 keeps the low 24 bits of each element, sign-extended.
%shl.lhs = shl <2 x i64> %lhs, <i64 40, i64 40>
%lhs24 = ashr <2 x i64> %shl.lhs, <i64 40, i64 40>
%shl.rhs = shl <2 x i64> %rhs, <i64 40, i64 40>
%rhs24 = ashr <2 x i64> %shl.rhs, <i64 40, i64 40>
; Element-wise 64-bit multiply of the sign-extended 24-bit values.
%mul = mul <2 x i64> %lhs24, %rhs24
ret <2 x i64> %mul
}

; This requires handling of the original 64-bit mul node to eliminate
; unnecessary extension instructions because after legalization they
; will not be removed by SimplifyDemandedBits because there are
Expand Down Expand Up @@ -588,10 +708,10 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_cbranch_scc0 BB6_2
; SI-NEXT: s_cbranch_scc0 BB8_2
; SI-NEXT: ; %bb.1: ; %bb7
; SI-NEXT: s_endpgm
; SI-NEXT: BB6_2: ; %bb11
; SI-NEXT: BB8_2: ; %bb11
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_load_dword s4, s[0:1], 0xf
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
Expand All @@ -611,10 +731,10 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 BB6_2
; VI-NEXT: s_cbranch_scc0 BB8_2
; VI-NEXT: ; %bb.1: ; %bb7
; VI-NEXT: s_endpgm
; VI-NEXT: BB6_2: ; %bb11
; VI-NEXT: BB8_2: ; %bb11
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_load_dword s0, s[0:1], 0x3c
Expand All @@ -634,10 +754,10 @@ define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc0 BB6_2
; GFX9-NEXT: s_cbranch_scc0 BB8_2
; GFX9-NEXT: ; %bb.1: ; %bb7
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: BB6_2: ; %bb11
; GFX9-NEXT: BB8_2: ; %bb11
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x3c
Expand Down
66 changes: 66 additions & 0 deletions llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
Expand Up @@ -571,6 +571,72 @@ entry:
ret void
}

; Unsigned multiply of two i64 operands that are each masked to their low
; 24 bits (and with 16777215, i.e. 0xffffff) before the i64 mul.
; The GCN checks expect a v_mul_u32_u24 (written to v0) fed by the unmasked
; inputs v0/v2, and a v_mul_hi_u32_u24 (written to v1) fed by copies of the
; inputs that were explicitly masked with the 0xffffff constant held in s4.
define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
; GCN-LABEL: test_umul48_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0xffffff
; GCN-NEXT: v_and_b32_e32 v1, s4, v0
; GCN-NEXT: v_and_b32_e32 v3, s4, v2
; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v2
; GCN-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
; 16777215 == 0xffffff: keep only the low 24 bits of each operand.
%lhs24 = and i64 %lhs, 16777215
%rhs24 = and i64 %rhs, 16777215
; Full 64-bit multiply of the two zero-extended 24-bit values.
%mul = mul i64 %lhs24, %rhs24
ret i64 %mul
}

; <2 x i64> variant of the unsigned test: every lane of both operands is
; masked to its low 24 bits (and with 16777215, i.e. 0xffffff) and the
; vectors are then multiplied.
; All three check prefixes expect two v_mul_u32_u24 / v_mul_hi_u32_u24 pairs,
; with the v_mul_hi inputs masked by the 0xffffff constant held in s4.
; The SI checks compute the v_mul_u32_u24 results into v5/v7 and then move
; them into v0/v2 with v_mov_b32; the VI and GFX9 checks write v0..v3 directly.
define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; SI-LABEL: test_umul48_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0xffffff
; SI-NEXT: v_mul_u32_u24_e32 v5, v0, v4
; SI-NEXT: v_mul_u32_u24_e32 v7, v2, v6
; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_and_b32_e32 v0, s4, v0
; SI-NEXT: v_and_b32_e32 v3, s4, v6
; SI-NEXT: v_and_b32_e32 v1, s4, v4
; SI-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1
; SI-NEXT: v_mul_hi_u32_u24_e32 v3, v2, v3
; SI-NEXT: v_mov_b32_e32 v0, v5
; SI-NEXT: v_mov_b32_e32 v2, v7
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_umul48_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0xffffff
; VI-NEXT: v_and_b32_e32 v3, s4, v2
; VI-NEXT: v_and_b32_e32 v1, s4, v0
; VI-NEXT: v_and_b32_e32 v5, s4, v6
; VI-NEXT: v_and_b32_e32 v7, s4, v4
; VI-NEXT: v_mul_u32_u24_e32 v0, v0, v4
; VI-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v7
; VI-NEXT: v_mul_u32_u24_e32 v2, v2, v6
; VI-NEXT: v_mul_hi_u32_u24_e32 v3, v3, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_umul48_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0xffffff
; GFX9-NEXT: v_and_b32_e32 v3, s4, v2
; GFX9-NEXT: v_and_b32_e32 v1, s4, v0
; GFX9-NEXT: v_and_b32_e32 v5, s4, v6
; GFX9-NEXT: v_and_b32_e32 v7, s4, v4
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v0, v4
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v7
; GFX9-NEXT: v_mul_u32_u24_e32 v2, v2, v6
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v3, v3, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
; 16777215 == 0xffffff: keep only the low 24 bits of every element.
%lhs24 = and <2 x i64> %lhs, <i64 16777215, i64 16777215>
%rhs24 = and <2 x i64> %rhs, <i64 16777215, i64 16777215>
; Element-wise 64-bit multiply of the zero-extended 24-bit values.
%mul = mul <2 x i64> %lhs24, %rhs24
ret <2 x i64> %mul
}

define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
; SI-LABEL: test_umul24_i64_square:
; SI: ; %bb.0: ; %entry
Expand Down

0 comments on commit fa59218

Please sign in to comment.