Skip to content

Commit

Permalink
[AMDGPU] Use v_max_f* for fcanonicalize
Browse files Browse the repository at this point in the history
If denorms are not flushed we can use max instead of multiplication
by 1. For double that is simply faster, while for float and half
it is shorter, because mul uses constant bus and VOP3.

Differential Revision: https://reviews.llvm.org/D36856

llvm-svn: 312095
  • Loading branch information
rampitec committed Aug 30, 2017
1 parent 5a2898a commit 06cab79
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 35 deletions.
9 changes: 6 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Expand Up @@ -42,9 +42,12 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "",
field bits<32> Inst = 0xffffffff;
}

def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">;
def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">;
def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">;
def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">;
def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">;
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;

def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
Expand Down
27 changes: 27 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Expand Up @@ -1278,20 +1278,47 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;

let Predicates = [NoFP16Denormals] in {
def : Pat<
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
(V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
>;
}

let Predicates = [FP16Denormals] in {
def : Pat<
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
(V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
>;
}

let Predicates = [NoFP32Denormals] in {
def : Pat<
(fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
(V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
>;
}

let Predicates = [FP32Denormals] in {
def : Pat<
(fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
(V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0)
>;
}

let Predicates = [NoFP64Denormals] in {
def : Pat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
(V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
>;
}

let Predicates = [FP64Denormals] in {
def : Pat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
(V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0)
>;
}

def : Pat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
Expand Down
15 changes: 10 additions & 5 deletions llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
Expand Up @@ -4,7 +4,8 @@
; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GCN-FLUSH %s

; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX9-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
Expand Down Expand Up @@ -129,7 +130,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrsp

; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
; GCN-FLUSH: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
Expand Down Expand Up @@ -223,7 +225,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x
}

; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}}
; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}}
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
Expand All @@ -250,7 +253,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace
}

; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
Expand Down Expand Up @@ -377,7 +381,8 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspa

; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}}
; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[V0]], [[V0]]
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand Down
38 changes: 17 additions & 21 deletions llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
Expand Up @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0


; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
; GCN: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; GCN: buffer_store_short [[REG]]
define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
%val = load half, half addrspace(1)* %out
Expand All @@ -19,7 +19,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out)
}

; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
; GCN: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
; GCN: buffer_store_short [[REG]]
define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
%val = bitcast i16 %val.arg to half
Expand All @@ -29,7 +29,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out,
}

; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
; GCN: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
; GCN: buffer_store_short [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
%val = load half, half addrspace(1)* %out
Expand All @@ -40,7 +40,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %
}

; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
; GCN: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
; GCN: buffer_store_short [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
%val = load half, half addrspace(1)* %out
Expand All @@ -52,7 +52,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(
}

; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
; GCN: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
; GCN: buffer_store_short [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
%val = load half, half addrspace(1)* %out
Expand Down Expand Up @@ -207,9 +207,8 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace
}

; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
; VI: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; VI-NOT: v_and_b32

; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
Expand All @@ -227,9 +226,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
; VI-DAG: v_bfe_u32
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; VI-NOT: 0xffff
; VI: v_or_b32

Expand All @@ -247,10 +245,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa
}

; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_or_b32

; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
Expand All @@ -269,10 +266,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad

; FIXME: Fold modifier
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
; VI-DAG: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
; VI-DAG: v_mul_f16_sdwa [[REG1:v[0-9]+]], v[[CONST1]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
; VI: v_lshrrev_b32_e32 [[FNEGHI:v[0-9]+]], 16, [[FNEG]]
; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEGHI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]]
; VI-NOT: 0xffff

; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
Expand All @@ -288,9 +285,8 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa
}

; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00
; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], [[ONE]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
; VI-NOT: v_and_b32

; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
Expand Down
88 changes: 82 additions & 6 deletions llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -1,9 +1,11 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.canonicalize.f64(double) #0
declare half @llvm.canonicalize.f16(half) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0

; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
Expand Down Expand Up @@ -203,7 +205,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspac
}

; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}}
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
%val = load double, double addrspace(1)* %out
Expand All @@ -213,7 +215,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out
}

; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}}
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double %val)
Expand All @@ -222,7 +224,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out
}

; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64:
; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}|
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], |{{v\[[0-9]+:[0-9]+\]}}|, |{{v\[[0-9]+:[0-9]+\]}}|
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
%val = load double, double addrspace(1)* %out
Expand All @@ -233,7 +235,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)*
}

; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64:
; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}|
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]\]]], -|{{v\[[0-9]+:[0-9]+\]}}|, -|{{v\[[0-9]+:[0-9]+\]}}|
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
%val = load double, double addrspace(1)* %out
Expand All @@ -245,7 +247,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspac
}

; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64:
; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}}
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -{{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
%val = load double, double addrspace(1)* %out
Expand Down Expand Up @@ -415,7 +417,81 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspa
ret void
}

; GCN-LABEL: {{^}}test_canonicalize_value_f64_flush:
; GCN: v_mul_f64 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_canonicalize_value_f64_flush(double addrspace(1)* %arg, double addrspace(1)* %out) #4 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
%v = load double, double addrspace(1)* %gep, align 8
%canonicalized = tail call double @llvm.canonicalize.f64(double %v)
%gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
store double %canonicalized, double addrspace(1)* %gep2, align 8
ret void
}

; GCN-LABEL: {{^}}test_canonicalize_value_f32_flush:
; GCN: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
define amdgpu_kernel void @test_canonicalize_value_f32_flush(float addrspace(1)* %arg, float addrspace(1)* %out) #4 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%v = load float, float addrspace(1)* %gep, align 4
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
%gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
store float %canonicalized, float addrspace(1)* %gep2, align 4
ret void
}

; GCN-LABEL: {{^}}test_canonicalize_value_f16_flush:
; GCN: v_mul_f16_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
define amdgpu_kernel void @test_canonicalize_value_f16_flush(half addrspace(1)* %arg, half addrspace(1)* %out) #4 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
%v = load half, half addrspace(1)* %gep, align 2
%canonicalized = tail call half @llvm.canonicalize.f16(half %v)
%gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
store half %canonicalized, half addrspace(1)* %gep2, align 2
ret void
}

; GCN-LABEL: {{^}}test_canonicalize_value_f64_denorm:
; GCN: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @test_canonicalize_value_f64_denorm(double addrspace(1)* %arg, double addrspace(1)* %out) #5 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
%v = load double, double addrspace(1)* %gep, align 8
%canonicalized = tail call double @llvm.canonicalize.f64(double %v)
%gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
store double %canonicalized, double addrspace(1)* %gep2, align 8
ret void
}

; GCN-LABEL: {{^}}test_canonicalize_value_f32_denorm:
; GCN: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_canonicalize_value_f32_denorm(float addrspace(1)* %arg, float addrspace(1)* %out) #5 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%v = load float, float addrspace(1)* %gep, align 4
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
%gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
store float %canonicalized, float addrspace(1)* %gep2, align 4
ret void
}

; GCN-LABEL: {{^}}test_canonicalize_value_f16_denorm:
; GCN: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_canonicalize_value_f16_denorm(half addrspace(1)* %arg, half addrspace(1)* %out) #5 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
%v = load half, half addrspace(1)* %gep, align 2
%canonicalized = tail call half @llvm.canonicalize.f16(half %v)
%gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
store half %canonicalized, half addrspace(1)* %gep2, align 2
ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
attributes #4 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" "target-cpu"="tonga" }
attributes #5 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" "target-cpu"="gfx900" }

0 comments on commit 06cab79

Please sign in to comment.