52 changes: 27 additions & 25 deletions llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,SDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_10,SDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX9_10,SDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,SDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_10,SDWA,GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX9_10,SDWA,GCN %s

; GCN-LABEL: {{^}}add_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
Expand All @@ -12,7 +12,7 @@
; GFX9: v_add_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10: v_add_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%a = load i32, i32 addrspace(1)* %in, align 4
%shr = lshr i32 %a, 16
%add = add i32 %a, %shr
Expand All @@ -28,7 +28,7 @@ define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9: v_sub_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10: v_sub_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
%a = load i32, i32 addrspace(1)* %in, align 4
%shr = lshr i32 %a, 16
%sub = sub i32 %shr, %a
Expand All @@ -44,7 +44,7 @@ define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*

; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) #0 {
%a = load i32, i32 addrspace(1)* %in1, align 4
%b = load i32, i32 addrspace(1)* %in2, align 4
%shra = lshr i32 %a, 16
Expand All @@ -61,7 +61,7 @@ define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
; GFX10: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) #0 {
entry:
%a = load i16, i16 addrspace(1)* %ina, align 4
%b = load i16, i16 addrspace(1)* %inb, align 4
Expand All @@ -84,7 +84,7 @@ entry:

; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
entry:
%a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
%b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
Expand All @@ -111,7 +111,7 @@ entry:
; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) #0 {
entry:
%a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
%b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
Expand Down Expand Up @@ -146,7 +146,7 @@ entry:
; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) #0 {
entry:
%a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
%b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
Expand All @@ -161,7 +161,7 @@ entry:
; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_f16_sdwa

define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) #0 {
entry:
%a = load half, half addrspace(1)* %ina, align 4
%b = load half, half addrspace(1)* %inb, align 4
Expand All @@ -184,7 +184,7 @@ entry:

; GFX9_10: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 {
entry:
%a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
%b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
Expand All @@ -209,7 +209,7 @@ entry:
; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) #0 {
entry:
%a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
%b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
Expand Down Expand Up @@ -240,7 +240,7 @@ entry:
; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) #0 {
entry:
%a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
%b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
Expand All @@ -256,7 +256,7 @@ entry:
; GFX10: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa

define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) {
define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) #0 {
entry:
%a = load i8, i8 addrspace(1)* %ina, align 4
%b = load i8, i8 addrspace(1)* %inb, align 4
Expand Down Expand Up @@ -285,7 +285,7 @@ entry:

; GFX10: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, v
; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) #0 {
entry:
%a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
%b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
Expand Down Expand Up @@ -315,7 +315,7 @@ entry:
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64

define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) #0 {
entry:
%a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
%b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
Expand Down Expand Up @@ -355,7 +355,7 @@ entry:
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64

define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) #0 {
entry:
%a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
%b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
Expand All @@ -376,7 +376,7 @@ entry:
; FIXME: Should be able to avoid or
define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x i16> addrspace(1)* %a) {
<2 x i16> addrspace(1)* %a) #0 {
entry:
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
%r.val = sitofp <2 x i16> %a.val to <2 x half>
Expand All @@ -399,7 +399,7 @@ entry:
; GFX9_10: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]]
; GFX9_10: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]]

define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 {
entry:
%a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
%b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
Expand All @@ -421,7 +421,7 @@ entry:

; GFX10: v_pk_mul_lo_u16 v{{[0-9]+}}, 0x141007b, v{{[0-9]+}}

define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
entry:
%a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
%mul = mul <2 x i16> %a, <i16 123, i16 321>
Expand All @@ -443,7 +443,7 @@ entry:
; GFX9_10: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}}

define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
entry:
%a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
%b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
Expand All @@ -460,7 +460,7 @@ entry:

; GFX9_10: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 {
entry:
%a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
%b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
Expand Down Expand Up @@ -503,7 +503,7 @@ store_label:
; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD

define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) {
define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) #0 {
entry:
%idxprom = ashr exact i64 15, 32
%arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
Expand Down Expand Up @@ -564,3 +564,5 @@ bb11: ; preds = %bb10, %bb2
store volatile <2 x i32> %tmp12, <2 x i32> addrspace(1)* undef
br label %bb1
}

attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/udiv.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s

; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

Expand Down
61 changes: 61 additions & 0 deletions llvm/test/CodeGen/AMDGPU/udivrem24.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,63 @@ define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in
ret void
}

; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in_out:
; SI: v_cvt_f32_ubyte
; SI-DAG: v_cvt_f32_ubyte
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
%den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
%num = load i8, i8 addrspace(1) * %in
%den = load i8, i8 addrspace(1) * %den_ptr
%result = udiv i8 %num, %den
store i8 %result, i8 addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in:
; SI: v_cvt_f32_ubyte
; SI-DAG: v_cvt_f32_ubyte
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i8_denorm_flush_in(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
%num = load i8, i8 addrspace(1) * %in
%den = load i8, i8 addrspace(1) * %den_ptr
%result = udiv i8 %num, %den
store i8 %result, i8 addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_out:
; SI: v_cvt_f32_ubyte
; SI-DAG: v_cvt_f32_ubyte
; SI-DAG: v_rcp_iflag_f32
; SI: v_cvt_u32_f32

; EG: UINT_TO_FLT
; EG-DAG: UINT_TO_FLT
; EG-DAG: RECIP_IEEE
; EG: FLT_TO_UINT
define amdgpu_kernel void @udiv24_i8_denorm_flush_out(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #2 {
%den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
%num = load i8, i8 addrspace(1) * %in
%den = load i8, i8 addrspace(1) * %den_ptr
%result = udiv i8 %num, %den
store i8 %result, i8 addrspace(1)* %out
ret void
}

; FUNC-LABEL: {{^}}udiv24_i16:
; SI: v_cvt_f32_u32
; SI: v_cvt_f32_u32
Expand Down Expand Up @@ -325,3 +382,7 @@ define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 a
store i32 %result, i32 addrspace(1)* %out, align 4
ret void
}

attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
attributes #2 = { "denormal-fp-math-f32"="preserve-sign,ieee" }
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/v_mac.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s

; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp32-denormals,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}mac_f16:
; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
Expand Down Expand Up @@ -677,6 +677,6 @@ entry:

declare void @llvm.amdgcn.s.barrier() #2

attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #2 = { nounwind convergent }
10 changes: 6 additions & 4 deletions llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -mattr=-fp32-denormals,-fp64-fp16-denormals -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI

define amdgpu_kernel void @madak_f16(
; SI-LABEL: madak_f16:
Expand Down Expand Up @@ -52,7 +52,7 @@ define amdgpu_kernel void @madak_f16(
; VI-NEXT: s_endpgm
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
half addrspace(1)* %b) #0 {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
Expand Down Expand Up @@ -136,7 +136,7 @@ define amdgpu_kernel void @madak_f16_use_2(
half addrspace(1)* %r1,
half addrspace(1)* %a,
half addrspace(1)* %b,
half addrspace(1)* %c) {
half addrspace(1)* %c) #0 {
entry:
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
Expand All @@ -151,3 +151,5 @@ entry:
store half %r1.val, half addrspace(1)* %r1
ret void
}

attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }
51 changes: 18 additions & 33 deletions llvm/test/Transforms/Inline/AMDGPU/inline-target-cpu.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,89 +15,74 @@ define i32 @target_cpu_call_no_target_cpu() #1 {

; CHECK-LABEL: @target_cpu_target_features_call_no_target_cpu(
; CHECK-NEXT: ret i32 0
define i32 @target_cpu_target_features_call_no_target_cpu() #2 {
define i32 @target_cpu_target_features_call_no_target_cpu() {
%call = call i32 @func_no_target_cpu()
ret i32 %call
}

; CHECK-LABEL: @fp32_denormals(
define i32 @fp32_denormals() #3 {
ret i32 0
}

; CHECK-LABEL: @no_fp32_denormals_call_f32_denormals(
; CHECK-NEXT: call i32 @fp32_denormals()
define i32 @no_fp32_denormals_call_f32_denormals() #4 {
%call = call i32 @fp32_denormals()
ret i32 %call
}

; Make sure gfx9 can call unspecified functions because of movrel
; feature change.
; CHECK-LABEL: @gfx9_target_features_call_no_target_cpu(
; CHECK-NEXT: ret i32 0
define i32 @gfx9_target_features_call_no_target_cpu() #5 {
define i32 @gfx9_target_features_call_no_target_cpu() #2 {
%call = call i32 @func_no_target_cpu()
ret i32 %call
}

define i32 @func_no_halfrate64ops() #6 {
define i32 @func_no_halfrate64ops() #3 {
ret i32 0
}

define i32 @func_with_halfrate64ops() #7 {
define i32 @func_with_halfrate64ops() #4 {
ret i32 0
}

; CHECK-LABEL: @call_func_without_halfrate64ops(
; CHECK-NEXT: ret i32 0
define i32 @call_func_without_halfrate64ops() #7 {
define i32 @call_func_without_halfrate64ops() #4 {
%call = call i32 @func_no_halfrate64ops()
ret i32 %call
}

; CHECK-LABEL: @call_func_with_halfrate64ops(
; CHECK-NEXT: ret i32 0
define i32 @call_func_with_halfrate64ops() #6 {
define i32 @call_func_with_halfrate64ops() #3 {
%call = call i32 @func_with_halfrate64ops()
ret i32 %call
}

define i32 @func_no_loadstoreopt() #8 {
define i32 @func_no_loadstoreopt() #5 {
ret i32 0
}

define i32 @func_with_loadstoreopt() #9 {
define i32 @func_with_loadstoreopt() #6 {
ret i32 0
}

; CHECK-LABEL: @call_func_without_loadstoreopt(
; CHECK-NEXT: ret i32 0
define i32 @call_func_without_loadstoreopt() #9 {
define i32 @call_func_without_loadstoreopt() #6 {
%call = call i32 @func_no_loadstoreopt()
ret i32 %call
}

define i32 @enable_codeobjectv3() #10 {
define i32 @enable_codeobjectv3() #7 {
ret i32 999
}

; CHECK-LABEL: @disable_codeobjectv3_call_codeobjectv3(
; CHECK-NEXT: ret i32 999
define i32 @disable_codeobjectv3_call_codeobjectv3() #11 {
define i32 @disable_codeobjectv3_call_codeobjectv3() #8 {
%call = call i32 @enable_codeobjectv3()
ret i32 %call
}

attributes #0 = { nounwind }
attributes #1 = { nounwind "target-cpu"="fiji" }
attributes #2 = { nounwind "target-cpu"="fiji" "target-features"="+fp32-denormals" }
attributes #3 = { nounwind "target-features"="+fp32-denormals" }
attributes #4 = { nounwind "target-features"="-fp32-denormals" }
attributes #5 = { nounwind "target-cpu"="gfx900" }
attributes #6 = { nounwind "target-features"="-half-rate-64-ops" }
attributes #7 = { nounwind "target-features"="+half-rate-64-ops" }
attributes #8 = { nounwind "target-features"="-load-store-opt" }
attributes #9 = { nounwind "target-features"="+load-store-opt" }
attributes #10 = { nounwind "target-features"="+code-object-v3" }
attributes #11 = { nounwind "target-features"="-code-object-v3" }
attributes #2 = { nounwind "target-cpu"="gfx900" }
attributes #3 = { nounwind "target-features"="-half-rate-64-ops" }
attributes #4 = { nounwind "target-features"="+half-rate-64-ops" }
attributes #5 = { nounwind "target-features"="-load-store-opt" }
attributes #6 = { nounwind "target-features"="+load-store-opt" }
attributes #7 = { nounwind "target-features"="+code-object-v3" }
attributes #8 = { nounwind "target-features"="-code-object-v3" }
2 changes: 1 addition & 1 deletion llvm/test/tools/llvm-objdump/ELF/AMDGPU/source-lines.ll
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ entry:
; Function Attrs: nounwind readnone
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1

attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fp64-fp16-denormals,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }

!llvm.dbg.cu = !{!0}
Expand Down