Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
313 changes: 281 additions & 32 deletions llvm/test/CodeGen/AMDGPU/mad_int24.ll
Original file line number Diff line number Diff line change
@@ -1,17 +1,79 @@
; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefixes=GCN
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefixes=VI
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=EG,R600,RW
; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefixes=EG,R600,CM

; FUNC-LABEL: {{^}}i32_mad24:
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
; EG: MULLO_INT
; CM: MULLO_INT
; GCN: s_bfe_i32
; GCN: s_bfe_i32
; GCN: s_mul_i32
; GCN: s_add_i32
define amdgpu_kernel void @i32_mad24(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: i32_mad24:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000
; GCN-NEXT: s_bfe_i32 s1, s1, 0x180000
; GCN-NEXT: s_mul_i32 s0, s0, s1
; GCN-NEXT: s_add_i32 s0, s0, s2
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; VI-LABEL: i32_mad24:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
; VI-NEXT: s_bfe_i32 s1, s1, 0x180000
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: s_add_i32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; RW-LABEL: i32_mad24:
; RW: ; %bb.0: ; %entry
; RW-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; RW-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; RW-NEXT: CF_END
; RW-NEXT: PAD
; RW-NEXT: ALU clause starting at 4:
; RW-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; RW-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: ASHR T1.W, PS, literal.x,
; RW-NEXT: ASHR * T0.W, PV.W, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: MULLO_INT * T0.X, PS, PV.W,
; RW-NEXT: ADD_INT T0.X, PS, KC0[3].X,
; RW-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; RW-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: i32_mad24:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T0.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
; CM-NEXT: ADD_INT * T0.X, PV.X, KC0[3].X,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = shl i32 %a, 8
%a_24 = ashr i32 %0, 8
Expand All @@ -23,13 +85,25 @@ entry:
ret void
}

; GCN-LABEL: {{^}}mad24_known_bits_destroyed:
; GCN: s_waitcnt
; GCN-NEXT: v_mad_i32_i24
; GCN-NEXT: v_mul_i32_i24
; GCN-NEXT: s_setpc_b64
define i32 @mad24_known_bits_destroyed(i32 %a, i32 %b, i32 %c) {

; GCN-LABEL: mad24_known_bits_destroyed:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mad_i32_i24 v1, v0, v1, v2
; GCN-NEXT: v_mul_i32_i24_e32 v0, v1, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mad24_known_bits_destroyed:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mad_i32_i24 v1, v0, v1, v2
; VI-NEXT: v_mul_i32_i24_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: mad24_known_bits_destroyed:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
%shl.0 = shl i32 %a, 8
%sra.0 = ashr i32 %shl.0, 8
%shl.1 = shl i32 %b, 8
Expand All @@ -48,12 +122,25 @@ define i32 @mad24_known_bits_destroyed(i32 %a, i32 %b, i32 %c) {
ret i32 %mul1
}

; GCN-LABEL: {{^}}mad24_intrin_known_bits_destroyed:
; GCN: s_waitcnt
; GCN-NEXT: v_mad_i32_i24
; GCN-NEXT: v_mul_i32_i24
; GCN-NEXT: s_setpc_b64
define i32 @mad24_intrin_known_bits_destroyed(i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: mad24_intrin_known_bits_destroyed:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mad_i32_i24 v1, v0, v1, v2
; GCN-NEXT: v_mul_i32_i24_e32 v0, v1, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mad24_intrin_known_bits_destroyed:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mad_i32_i24 v1, v0, v1, v2
; VI-NEXT: v_mul_i32_i24_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: mad24_intrin_known_bits_destroyed:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
%shl.0 = shl i32 %a, 8
%sra.0 = ashr i32 %shl.0, 8
%shl.1 = shl i32 %b, 8
Expand All @@ -73,17 +160,177 @@ define i32 @mad24_intrin_known_bits_destroyed(i32 %a, i32 %b, i32 %c) {
}

; Make sure no unnecessary BFEs are emitted in the loop.
; GCN-LABEL: {{^}}mad24_destroyed_knownbits_2:
; GCN-NOT: v_bfe
; GCN: v_mad_i32_i24
; GCN-NOT: v_bfe
; GCN: v_mad_i32_i24
; GCN-NOT: v_bfe
; GCN: v_mad_i32_i24
; GCN-NOT: v_bfe
; GCN: v_mad_i32_i24
; GCN-NOT: v_bfe
define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(1) %arg3) {
; GCN-LABEL: mad24_destroyed_knownbits_2:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v5, 1
; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: .LBB3_1: ; %bb6
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_mad_i32_i24 v0, v0, v5, v5
; GCN-NEXT: v_add_i32_e32 v1, vcc, -1, v1
; GCN-NEXT: v_mad_i32_i24 v5, v0, v5, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GCN-NEXT: v_mad_i32_i24 v0, v5, v0, v5
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-NEXT: v_mad_i32_i24 v0, v0, v5, v0
; GCN-NEXT: v_mov_b32_e32 v5, v2
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz .LBB3_1
; GCN-NEXT: ; %bb.2: ; %bb5
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mad24_destroyed_knownbits_2:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, 1
; VI-NEXT: s_mov_b64 s[4:5], 0
; VI-NEXT: .LBB3_1: ; %bb6
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mad_i32_i24 v0, v0, v5, v5
; VI-NEXT: v_mad_i32_i24 v5, v0, v5, v0
; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v1
; VI-NEXT: v_mad_i32_i24 v0, v5, v0, v5
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; VI-NEXT: v_mad_i32_i24 v0, v0, v5, v0
; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; VI-NEXT: v_mov_b32_e32 v5, v2
; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
; VI-NEXT: s_cbranch_execnz .LBB3_1
; VI-NEXT: ; %bb.2: ; %bb5
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; RW-LABEL: mad24_destroyed_knownbits_2:
; RW: ; %bb.0: ; %bb
; RW-NEXT: ALU 5, @10, KC0[CB0:0-32], KC1[]
; RW-NEXT: LOOP_START_DX10 @7
; RW-NEXT: ALU_PUSH_BEFORE 30, @16, KC0[], KC1[]
; RW-NEXT: JUMP @6 POP:1
; RW-NEXT: LOOP_BREAK @6
; RW-NEXT: POP @6 POP:1
; RW-NEXT: END_LOOP @2
; RW-NEXT: ALU 1, @47, KC0[], KC1[]
; RW-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; RW-NEXT: CF_END
; RW-NEXT: ALU clause starting at 10:
; RW-NEXT: MOV T0.X, KC0[2].Y,
; RW-NEXT: MOV T0.Y, KC0[2].Z,
; RW-NEXT: MOV * T0.Z, KC0[2].W,
; RW-NEXT: MOV T0.W, KC0[3].X,
; RW-NEXT: MOV * T1.W, literal.x,
; RW-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; RW-NEXT: ALU clause starting at 16:
; RW-NEXT: LSHL T2.W, T1.W, literal.x,
; RW-NEXT: LSHL * T3.W, T0.X, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: ASHR T3.W, PS, literal.x,
; RW-NEXT: ASHR * T2.W, PV.W, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: MULLO_INT * T0.X, PV.W, PS,
; RW-NEXT: ADD_INT * T1.W, PS, T1.W,
; RW-NEXT: LSHL * T3.W, PV.W, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: ASHR * T3.W, PV.W, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: MULLO_INT * T0.X, PV.W, T2.W,
; RW-NEXT: ADD_INT * T1.W, PS, T1.W,
; RW-NEXT: LSHL * T2.W, PV.W, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: ASHR * T2.W, PV.W, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: MULLO_INT * T0.X, PV.W, T3.W,
; RW-NEXT: ADD_INT * T1.W, PS, T1.W,
; RW-NEXT: LSHL * T3.W, PV.W, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: ASHR * T3.W, PV.W, literal.x,
; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; RW-NEXT: ADD_INT T0.Y, T0.Y, literal.x,
; RW-NEXT: MULLO_INT * T0.X, PV.W, T2.W,
; RW-NEXT: -1(nan), 0(0.000000e+00)
; RW-NEXT: ADD_INT T0.X, PS, T1.W,
; RW-NEXT: SETE_INT T2.W, PV.Y, 0.0,
; RW-NEXT: MOV * T1.W, T0.Z,
; RW-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; RW-NEXT: ALU clause starting at 47:
; RW-NEXT: LSHR * T1.X, T0.W, literal.x,
; RW-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: mad24_destroyed_knownbits_2:
; CM: ; %bb.0: ; %bb
; CM-NEXT: ALU 5, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: LOOP_START_DX10 @7
; CM-NEXT: ALU_PUSH_BEFORE 41, @16, KC0[], KC1[]
; CM-NEXT: JUMP @6 POP:1
; CM-NEXT: LOOP_BREAK @6
; CM-NEXT: POP @6 POP:1
; CM-NEXT: END_LOOP @2
; CM-NEXT: ALU 1, @58, KC0[], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T1.X, KC0[2].Y,
; CM-NEXT: MOV T0.X, KC0[2].Z,
; CM-NEXT: MOV T0.Y, KC0[2].W,
; CM-NEXT: MOV T0.Z, KC0[3].X,
; CM-NEXT: MOV * T0.W, literal.x,
; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 16:
; CM-NEXT: LSHL T1.Z, T0.W, literal.x,
; CM-NEXT: LSHL * T1.W, T1.X, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ASHR T2.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T1.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T1.X, T2.Z, T1.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T2.Z, T1.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T2.Z, T1.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T2.Z, T1.W,
; CM-NEXT: ADD_INT * T0.W, PV.X, T0.W,
; CM-NEXT: LSHL * T2.W, PV.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ASHR * T2.W, PV.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T1.X, T2.W, T1.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T2.W, T1.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T2.W, T1.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T2.W, T1.W,
; CM-NEXT: ADD_INT * T0.W, PV.X, T0.W,
; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ASHR * T1.W, PV.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T1.X, T1.W, T2.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T1.W, T2.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T1.W, T2.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T1.W, T2.W,
; CM-NEXT: ADD_INT * T0.W, PV.X, T0.W,
; CM-NEXT: LSHL * T2.W, PV.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ADD_INT T0.X, T0.X, literal.x,
; CM-NEXT: ASHR * T2.W, PV.W, literal.y,
; CM-NEXT: -1(nan), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T2.W, T1.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T2.W, T1.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T2.W, T1.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T2.W, T1.W,
; CM-NEXT: ADD_INT T1.X, PV.X, T0.W,
; CM-NEXT: SETE_INT T1.Z, T0.X, 0.0,
; CM-NEXT: MOV * T0.W, T0.Y,
; CM-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.Z, 0.0,
; CM-NEXT: ALU clause starting at 58:
; CM-NEXT: LSHR * T0.X, T0.Z, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
bb:
br label %bb6

Expand Down Expand Up @@ -119,3 +366,5 @@ bb6: ; preds = %bb6, %bb
}

; AMDGPU intrinsic declaration: takes two i32 operands and returns an i32.
; NOTE(review): presumably called by @mad24_intrin_known_bits_destroyed; the
; call site is not visible in this view, so confirm before relying on that.
declare i32 @llvm.amdgcn.mul.i24(i32, i32)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; R600: {{.*}}
Loading