diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 97852523033dde..9f38e92c434d57 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1063,6 +1063,9 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; +def HasMinMaxDenormModes : Predicate<"Subtarget->supportsMinMaxDenormModes()">; +def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()">; + def HasSDWA : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<(all_of FeatureSDWA, FeatureVolcanicIslands)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 483b1568e532ed..706053a4e3157e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -77,6 +77,9 @@ class ILFormat pattern> def TruePredicate : Predicate<"">; +// FIXME: Tablegen should specially support this +def FalsePredicate : Predicate<"false">; + // Add a predicate to the list if does not already exist to deduplicate it. class PredConcat<list<Predicate> lst, Predicate pred> { list<Predicate> ret = diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 40eebb044fd12d..dae8b0b7d39aab 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1878,7 +1878,9 @@ def : GCNPat < } -let OtherPredicates = [NoFP16Denormals] in { + +// Prefer selecting to max when legal, but using mul is always valid. 
+let AddedComplexity = -5 in { def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) @@ -1893,23 +1895,7 @@ def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) >; -} - -let OtherPredicates = [FP16Denormals] in { -def : GCNPat< - (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0) ->; -let SubtargetPredicate = HasVOP3PInsts in { -def : GCNPat< - (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), - (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE) ->; -} -} - -let OtherPredicates = [NoFP32Denormals] in { def : GCNPat< (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src) @@ -1919,29 +1905,69 @@ def : GCNPat< (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))), (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src) >; -} - -let OtherPredicates = [FP32Denormals] in { -def : GCNPat< - (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), - (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src) ->; -} -let OtherPredicates = [NoFP64Denormals] in { +// TODO: Handle fneg like other types. 
def : GCNPat< (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src) >; -} +} // End AddedComplexity = -5 -let OtherPredicates = [FP64Denormals] in { -def : GCNPat< - (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), - (V_MAX_F64 $src_mods, $src, $src_mods, $src) ->; +multiclass SelectCanonicalizeAsMax< + list<Predicate> f32_preds = [], + list<Predicate> f64_preds = [], + list<Predicate> f16_preds = []> { + def : GCNPat< + (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), + (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> { + let OtherPredicates = f32_preds; + } + + def : GCNPat< + (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), + (V_MAX_F64 $src_mods, $src, $src_mods, $src)> { + let OtherPredicates = f64_preds; + } + + def : GCNPat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { + // FIXME: Should have 16-bit inst subtarget predicate + let OtherPredicates = f16_preds; + } + + def : GCNPat< + (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), + (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> { + // FIXME: Should have VOP3P subtarget predicate + let OtherPredicates = f16_preds; + } } +// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal +// mode, and would never flush. For f64, it's faster to implement +// this with a max. For f16/f32 it's a wash, but prefer max when +// valid. +// +// FIXME: Lowering f32/f16 with max is worse since we can use a +// smaller encoding if the input is fneg'd. It also adds an extra +// register use. +let SubtargetPredicate = HasMinMaxDenormModes in { + defm : SelectCanonicalizeAsMax<[], [], []>; +} // End SubtargetPredicate = HasMinMaxDenormModes + +let SubtargetPredicate = NotHasMinMaxDenormModes in { + // Use the max lowering if we don't need to flush. 
+ + // FIXME: We don't use this for f32 as a workaround for the + // library being compiled with the default ieee mode, but + // potentially being called from flushing kernels. Really we should + // not be mixing code expecting different default FP modes, but mul + // works in any FP environment. + defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>; +} // End SubtargetPredicate = NotHasMinMaxDenormModes + + let OtherPredicates = [HasDLInsts] in { def : GCNPat < (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir index b62e24921ebf74..75086984a142d7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir @@ -48,8 +48,8 @@ body: | ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F16_e64_]] ; GFX9-LABEL: name: fcanonicalize_f16_flush ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, 15360, 0, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F16_e64_]] + ; GFX9: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FCANONICALIZE %1 @@ -72,8 +72,8 @@ body: | ; GFX8-LABEL: name: fcanonicalize_f32_denorm ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] + ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 0, [[COPY]], 0, 0, implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] ; GFX9-LABEL: name: fcanonicalize_f32_denorm ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: 
[[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec @@ -103,8 +103,8 @@ body: | ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] ; GFX9-LABEL: name: fcanonicalize_f32_flush ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 0, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] + ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FCANONICALIZE %0 S_ENDPGM 0, implicit %1 @@ -125,9 +125,9 @@ body: | liveins: $vgpr0 ; GFX8-LABEL: name: fcanonicalize_v2f16_denorm - ; GFX8: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX8: [[FCANONICALIZE:%[0-9]+]]:vgpr(<2 x s16>) = G_FCANONICALIZE [[COPY]] - ; GFX8: S_ENDPGM 0, implicit [[FCANONICALIZE]](<2 x s16>) + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]] ; GFX9-LABEL: name: fcanonicalize_v2f16_denorm ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec @@ -157,8 +157,8 @@ body: | ; GFX8: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]] ; GFX9-LABEL: name: fcanonicalize_v2f16_flush ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 0, 15360, 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]] + ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = G_FCANONICALIZE %0 S_ENDPGM 0, implicit %1 @@ -211,8 +211,8 @@ body: | ; 
GFX8: S_ENDPGM 0, implicit [[V_MUL_F64_]] ; GFX9-LABEL: name: fcanonicalize_f64_flush ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[V_MUL_F64_:%[0-9]+]]:vreg_64 = V_MUL_F64 0, 4607182418800017408, 0, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F64_]] + ; GFX9: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F64_]] %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_FCANONICALIZE %0 S_ENDPGM 0, implicit %1 @@ -233,8 +233,8 @@ body: | liveins: $vgpr0 ; GFX8-LABEL: name: fcanonicalize_fabs_f32_denorm ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] + ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[COPY]], 0, 0, implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] ; GFX9-LABEL: name: fcanonicalize_fabs_f32_denorm ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec @@ -265,8 +265,8 @@ body: | ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] ; GFX9-LABEL: name: fcanonicalize_fabs_f32_flush ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] + ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FABS %0 %2:vgpr(s32) = G_FCANONICALIZE %1 @@ -288,8 +288,8 @@ body: | liveins: $vgpr0 ; GFX8-LABEL: name: fcanonicalize_fneg_f32_denorm ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX8: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 
0, 0, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] + ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 3212836864, 0, [[COPY]], 0, 0, implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] ; GFX9-LABEL: name: fcanonicalize_fneg_f32_denorm ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec @@ -319,8 +319,8 @@ body: | ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] ; GFX9-LABEL: name: fcanonicalize_fneg_f32_flush ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 3212836864, 0, [[COPY]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] + ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FNEG %0 %2:vgpr(s32) = G_FCANONICALIZE %1 @@ -344,8 +344,8 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX8: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX8: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[V_XOR_B32_e32_]], 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] + ; GFX8: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] ; GFX9-LABEL: name: fcanonicalize_fneg_fabs_f32_denorm ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 @@ -382,8 +382,8 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX9: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; 
GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]] + ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[V_XOR_B32_e32_]], 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_FNEG %0 %2:vgpr(s32) = G_FABS %1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll index 2ad2ee3502d8a7..d306268ecb5135 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll @@ -1,10 +1,10 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}kernel_ieee_mode_default: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_max_f32_e32 [[QUIET0:v[0-9]+]], [[VAL0]], [[VAL0]] -; GCN-DAG: v_max_f32_e32 [[QUIET1:v[0-9]+]], [[VAL1]], [[VAL1]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] ; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_default() #0 { @@ -18,8 +18,8 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 { ; GCN-LABEL: {{^}}kernel_ieee_mode_on: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_max_f32_e32 [[QUIET0:v[0-9]+]], [[VAL0]], [[VAL0]] -; GCN-DAG: v_max_f32_e32 [[QUIET1:v[0-9]+]], [[VAL1]], [[VAL1]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] ; GCN-NOT: 
v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_on() #1 { @@ -48,8 +48,8 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 { ; GCN-LABEL: {{^}}func_ieee_mode_default: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_max_f32_e32 [[QUIET0:v[0-9]+]], [[VAL0]], [[VAL0]] -; GCN-DAG: v_max_f32_e32 [[QUIET1:v[0-9]+]], [[VAL1]], [[VAL1]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] ; GCN-NOT: v_mul_f32 define void @func_ieee_mode_default() #0 { @@ -63,8 +63,8 @@ define void @func_ieee_mode_default() #0 { ; GCN-LABEL: {{^}}func_ieee_mode_on: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_max_f32_e32 [[QUIET0:v[0-9]+]], [[VAL0]], [[VAL0]] -; GCN-DAG: v_max_f32_e32 [[QUIET1:v[0-9]+]], [[VAL1]], [[VAL1]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] ; GCN-NOT: v_mul_f32 define void @func_ieee_mode_on() #1 { @@ -93,8 +93,8 @@ define void @func_ieee_mode_off() #2 { ; GCN-LABEL: {{^}}cs_ieee_mode_default: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_max_f32_e32 [[QUIET0:v[0-9]+]], [[VAL0]], [[VAL0]] -; GCN-DAG: v_max_f32_e32 [[QUIET1:v[0-9]+]], [[VAL1]], [[VAL1]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] ; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_default() #0 { @@ -108,8 +108,8 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 { ; GCN-LABEL: {{^}}cs_ieee_mode_on: ; GCN: 
{{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_max_f32_e32 [[QUIET0:v[0-9]+]], [[VAL0]], [[VAL0]] -; GCN-DAG: v_max_f32_e32 [[QUIET1:v[0-9]+]], [[VAL1]], [[VAL1]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] ; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_on() #1 { @@ -153,8 +153,8 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 { ; GCN-LABEL: {{^}}ps_ieee_mode_on: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_max_f32_e32 [[QUIET0:v[0-9]+]], [[VAL0]], [[VAL0]] -; GCN-DAG: v_max_f32_e32 [[QUIET1:v[0-9]+]], [[VAL1]], [[VAL1]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] ; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_on() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index df48096a368fcc..67ff2d0452ad6a 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GFX678 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI,GFX678 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s ; GCN-LABEL: {{^}}v_clamp_f32: @@ -74,7 +74,8 @@ 
define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a ; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]] +; GFX678: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]] +; GFX9: v_max_f32_e32 [[QUIET:v[0-9]+]], [[A]], [[A]] ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]] define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { @@ -91,7 +92,8 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o ; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] +; GFX678: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] +; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[A]], [[A]] ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]] ; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[QUIET_A]] ; GCN-NOT: [[MAX]] @@ -416,7 +418,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] +; GFX678: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] +; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[A]], [[A]] ; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index 3d8a1c7d74584c..b3fae8b5411201 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -5,8 +5,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 
-verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32: -; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GFX9-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX9: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -172,8 +172,9 @@ define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrsp ; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32: ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]], -; GCN-FLUSH: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]] -; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]] +; VI: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]] +; GFX9: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]] + ; GCN-NOT: v_mul ; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] @@ -305,8 +306,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x } ; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32: -; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}} -; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}} +; GFX9: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -334,8 +335,8 @@ define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace } ; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32: -; GCN-FLUSH: 
v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| -; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| +; VI: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| +; GFX9: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -347,8 +348,9 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrsp } ; GCN-LABEL: test_no_fold_canonicalize_fcopysign_value_f32: -; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| -; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| +; VI: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| +; GFX9: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| + ; GCN-NOT: v_mul_ ; GCN-NOT: v_max_ define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(float addrspace(1)* %arg, float %sign) { @@ -454,10 +456,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode: ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] -; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]] -; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] -; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]] +; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]] +; GFX9: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] +; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]] ; GCN-NOT: v_max ; GCN-NOT: v_mul @@ -512,8 +514,8 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspa ; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32: ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]] -; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]] -; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]] +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]] +; GFX9: 
v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]] define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -530,10 +532,9 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace ; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] ; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]] -; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]] +; GFX9-FLUSH: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] ; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]] - ; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]] ; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]] @@ -647,7 +648,9 @@ entry: ; GFX9-DENORM: global_load_dword [[V:v[0-9]+]], ; GFX9-DENORM: global_store_dword v[{{[0-9:]+}}], [[V]] ; GFX9-DENORM-NOT: 1.0 -; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX9-DENORM-NOT: v_max +; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX9-FLUSH: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -736,13 +739,9 @@ define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float % ; GFX9: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 -; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0 -; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1 -; VI-FLUSH: v_min_f32_e32 v0, v0, v1 - -; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0 -; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1 -; VI-DENORM: v_min_f32_e32 v0, v0, v1 +; VI-DAG: v_mul_f32_e32 v0, 1.0, v0 +; VI-DAG: v_mul_f32_e32 v1, 1.0, v1 +; VI: v_min_f32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 define float 
@test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) { diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index e179ef37b3f8c8..7a44d11ad091d7 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -98,7 +98,8 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* % } ; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_var_f16: -; GFX89: v_mul_f16_e32 [[REG:v[0-9]+]], -1.0, v{{[0-9]+}} +; VI: v_mul_f16_e32 [[REG:v[0-9]+]], -1.0, v{{[0-9]+}} +; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half addrspace(1)* %out) #2 { %val = load half, half addrspace(1)* %out @@ -109,7 +110,9 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half ad } ; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_fabs_var_f16: -; GFX89: v_mul_f16_e64 [[REG:v[0-9]+]], -1.0, |v{{[0-9]+}}| +; VI: v_mul_f16_e64 [[REG:v[0-9]+]], -1.0, |v{{[0-9]+}}| +; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| + ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] ; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -|{{v[0-9]+}}| diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index ad472a92f86e21..e302d59b1ac74d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -1,4 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX678 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa 
-mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX678 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.canonicalize.f32(float) #0 @@ -16,8 +18,9 @@ declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32: -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: buffer_store_dword [[REG]] +; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %canonicalized = call float @llvm.canonicalize.f32(float %val) @@ -26,8 +29,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) } ; GCN-LABEL: {{^}}s_test_canonicalize_var_f32: -; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} -; GCN: buffer_store_dword [[REG]] +; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} +; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 { %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, float addrspace(1)* %out @@ -35,8 +39,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, } ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32: -; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| -; GCN: buffer_store_dword [[REG]] +; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| +; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}| +; GCN: 
{{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fabs = call float @llvm.fabs.f32(float %val) @@ -46,8 +51,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32: -; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], -1.0, |{{v[0-9]+}}| -; GCN: buffer_store_dword [[REG]] +; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], -1.0, |{{v[0-9]+}}| +; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}| +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fabs = call float @llvm.fabs.f32(float %val) @@ -58,8 +64,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32: -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], -1.0, {{v[0-9]+}} -; GCN: buffer_store_dword [[REG]] +; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], -1.0, {{v[0-9]+}} +; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}} +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fneg = fsub float -0.0, %val @@ -70,7 +77,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float undef) store float %canonicalized, float 
addrspace(1)* %out @@ -79,7 +86,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0.0) store float %canonicalized, float addrspace(1)* %out @@ -88,7 +95,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %ou ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -0.0) store float %canonicalized, float addrspace(1)* %out @@ -97,7 +104,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %ou ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 1.0) store float %canonicalized, float addrspace(1)* %out @@ -106,7 +113,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %ou ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -1.0) store float 
%canonicalized, float addrspace(1)* %out @@ -115,7 +122,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %ou ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 16.0) store float %canonicalized, float addrspace(1)* %out @@ -124,7 +131,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1) ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out @@ -133,7 +140,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(flo ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out @@ -142,7 +149,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, float addrspace(1)* %out @@ -151,7 +158,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(flo ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, float addrspace(1)* %out @@ -160,7 +167,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000) store float %canonicalized, float addrspace(1)* %out @@ -169,7 +176,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* % ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float)) store float %canonicalized, float addrspace(1)* %out @@ -178,7 +185,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_qnan_value_neg1_f32(float addr ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float)) store float %canonicalized, float addrspace(1)* %out @@ -187,7 +194,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addr ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float)) store float %canonicalized, float addrspace(1)* %out @@ -196,7 +203,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspac ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float)) store float %canonicalized, float addrspace(1)* %out @@ -205,7 +212,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspac ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void 
@test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float)) store float %canonicalized, float addrspace(1)* %out @@ -214,7 +221,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspac ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} -; GCN: buffer_store_dword [[REG]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float)) store float %canonicalized, float addrspace(1)* %out @@ -223,7 +230,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspac ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} -; GCN: buffer_store_dwordx2 [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %canonicalized = call double @llvm.canonicalize.f64(double %val) @@ -233,7 +240,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -; GCN: buffer_store_dwordx2 [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 { %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, double addrspace(1)* %out @@ -242,7 +249,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out ; GCN-LABEL: 
{{^}}v_test_canonicalize_fabs_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], |{{v\[[0-9]+:[0-9]+\]}}|, |{{v\[[0-9]+:[0-9]+\]}}| -; GCN: buffer_store_dwordx2 [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fabs = call double @llvm.fabs.f64(double %val) @@ -253,7 +260,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]\]]], -|{{v\[[0-9]+:[0-9]+\]}}|, -|{{v\[[0-9]+:[0-9]+\]}}| -; GCN: buffer_store_dwordx2 [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fabs = call double @llvm.fabs.f64(double %val) @@ -265,7 +272,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspac ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64: ; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -{{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}} -; GCN: buffer_store_dwordx2 [[REG]] +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fneg = fsub double -0.0, %val @@ -277,7 +284,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { 
%canonicalized = call double @llvm.canonicalize.f64(double 0.0) store double %canonicalized, double addrspace(1)* %out @@ -287,7 +294,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -0.0) store double %canonicalized, double addrspace(1)* %out @@ -297,7 +304,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 1.0) store double %canonicalized, double addrspace(1)* %out @@ -307,7 +314,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -1.0) store double %canonicalized, double addrspace(1)* %out @@ -317,7 +324,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_n1_f64(double addrspace(1)* %o ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 16.0) store double %canonicalized, double addrspace(1)* %out @@ -327,7 +334,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out @@ -337,7 +344,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(dou ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out @@ -347,7 +354,7 @@ define amdgpu_kernel void 
@test_denormals_fold_canonicalize_denormal0_f64(double ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f64: ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out @@ -357,7 +364,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(dou ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f64: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out @@ -367,7 +374,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) store double %canonicalized, double addrspace(1)* %out @@ -377,7 +384,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_qnan_f64(double addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) store double %canonicalized, double addrspace(1)* %out @@ -387,7 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double add ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) store double %canonicalized, double addrspace(1)* %out @@ -397,7 +404,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double add ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) store double %canonicalized, double addrspace(1)* %out @@ -407,7 +414,7 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan0_value_f64(double addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) store double %canonicalized, double addrspace(1)* %out @@ -417,7 +424,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double)) store double %canonicalized, double addrspace(1)* %out @@ -427,7 +434,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f64: ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) store double %canonicalized, double addrspace(1)* %out @@ -435,7 +442,8 @@ define amdgpu_kernel void 
@test_fold_canonicalize_snan3_value_f64(double addrspa } ; GCN-LABEL: {{^}}test_canonicalize_value_f64_flush: -; GCN: v_mul_f64 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}] +; GFX678: v_mul_f64 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}] +; GFX9: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @test_canonicalize_value_f64_flush(double addrspace(1)* %arg, double addrspace(1)* %out) #4 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id @@ -447,7 +455,8 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(double addrspace(1) } ; GCN-LABEL: {{^}}test_canonicalize_value_f32_flush: -; GCN: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +; GFX9: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_canonicalize_value_f32_flush(float addrspace(1)* %arg, float addrspace(1)* %out) #4 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -459,7 +468,8 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(float addrspace(1)* } ; GCN-LABEL: {{^}}test_canonicalize_value_f16_flush: -; GCN: v_mul_f16_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +; GFX8: v_mul_f16_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +; GFX9: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_canonicalize_value_f16_flush(half addrspace(1)* %arg, half addrspace(1)* %out) #4 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id @@ -470,23 +480,13 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(half addrspace(1)* ret void } -; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_flush_gfx8: -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00 -; GCN-DAG: v_mul_f16_sdwa v{{[0-9]+}}, [[ONE]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; 
GCN-DAG: v_mul_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -define amdgpu_kernel void @test_canonicalize_value_v2f16_flush_gfx8(<2 x half> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) #4 { - %id = tail call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %id - %v = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4 - %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v) - %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id - store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 2 - ret void -} +; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_flush: +; GFX8: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00 +; GFX8-DAG: v_mul_f16_sdwa v{{[0-9]+}}, [[ONE]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-DAG: v_mul_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_flush_gfx9: -; GCN-DAG: v_pk_mul_f16 v{{[0-9]+}}, 1.0, v{{[0-9]+}} op_sel_hi:[0,1]{{$}} -define amdgpu_kernel void @test_canonicalize_value_v2f16_flush_gfx9(<2 x half> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) #6 { +; GFX9: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(<2 x half> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) #4 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %id %v = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4 @@ -498,7 +498,7 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush_gfx9(<2 x half> a ; GCN-LABEL: {{^}}test_canonicalize_value_f64_denorm: ; GCN: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] -define amdgpu_kernel void @test_canonicalize_value_f64_denorm(double addrspace(1)* %arg, double addrspace(1)* %out) #5 { +define amdgpu_kernel void @test_canonicalize_value_f64_denorm(double 
addrspace(1)* %arg, double addrspace(1)* %out) #3 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id %v = load double, double addrspace(1)* %gep, align 8 @@ -509,8 +509,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(double addrspace(1 } ; GCN-LABEL: {{^}}test_canonicalize_value_f32_denorm: -; GCN: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define amdgpu_kernel void @test_canonicalize_value_f32_denorm(float addrspace(1)* %arg, float addrspace(1)* %out) #5 { +; GFX678: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +; GFX9: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define amdgpu_kernel void @test_canonicalize_value_f32_denorm(float addrspace(1)* %arg, float addrspace(1)* %out) #3 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id %v = load float, float addrspace(1)* %gep, align 4 @@ -520,9 +521,12 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(float addrspace(1) ret void } +; FIXME: Conversion to float should count as the canonicalize pre-gfx8 ; GCN-LABEL: {{^}}test_canonicalize_value_f16_denorm: -; GCN: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define amdgpu_kernel void @test_canonicalize_value_f16_denorm(half addrspace(1)* %arg, half addrspace(1)* %out) #5 { +; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +; GFX8: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define amdgpu_kernel void @test_canonicalize_value_f16_denorm(half addrspace(1)* %arg, half addrspace(1)* %out) #3 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id %v = load half, half addrspace(1)* %gep, align 2 @@ -533,8 +537,14 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(half addrspace(1)* } ; GCN-LABEL: 
{{^}}test_canonicalize_value_v2f16_denorm: -; GCN: v_pk_max_f16 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(<2 x half> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) #5 { +; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} + +; GFX8: v_max_f16_sdwa +; GFX8: v_max_f16_e32 + +; GFX9: v_pk_max_f16 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(<2 x half> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) #3 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %id %v = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4 @@ -556,43 +566,64 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(<2 x double> addrspace( ret void } -; GCN-LABEL: {{^}}v_test_canonicalize_v2f32: -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -define <2 x float> @v_test_canonicalize_v2f32(<2 x float> %arg) #1 { +; GCN-LABEL: {{^}}v_test_canonicalize_v2f32_flush: +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} + +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 { %canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg) ret <2 x float> %canon } -; GCN-LABEL: {{^}}v_test_canonicalize_v3f32: -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -define <3 x float> @v_test_canonicalize_v3f32(<3 x float> %arg) #1 { +; GCN-LABEL: {{^}}v_test_canonicalize_v3f32_flush: +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 
[[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} + +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 { %canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg) ret <3 x float> %canon } -; GCN-LABEL: {{^}}v_test_canonicalize_v4f32: -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -define <4 x float> @v_test_canonicalize_v4f32(<4 x float> %arg) #1 { +; GCN-LABEL: {{^}}v_test_canonicalize_v4f32_flush: +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} + +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 { %canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg) ret <4 x float> %canon } -; GCN-LABEL: {{^}}v_test_canonicalize_v8f32: -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, 
{{v[0-9]+}} -define <8 x float> @v_test_canonicalize_v8f32(<8 x float> %arg) #1 { +; GCN-LABEL: {{^}}v_test_canonicalize_v8f32_flush: +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GFX6: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} + +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 { %canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg) ret <8 x float> %canon } @@ -628,6 +659,4 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" } -attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "target-cpu"="tonga" } -attributes #5 = { nounwind "denormal-fp-math"="ieee,ieee" "target-cpu"="gfx900" } -attributes #6 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "target-cpu"="gfx900" } +attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll 
b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll index e37a1cead47d39..04fc0e35bee426 100644 --- a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX678,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s declare double @llvm.minnum.f64(double, double) #0 declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0 @@ -7,26 +8,44 @@ declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0 declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0 declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 -; FUNC-LABEL: {{^}}test_fmin_f64_ieee: -; SI: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]] -; SI: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]] -; SI-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]] -; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]] -define amdgpu_kernel void @test_fmin_f64_ieee([8 x i32], double %a, [8 x i32], double %b) nounwind { +; GCN-LABEL: {{^}}test_fmin_f64_ieee_noflush: +; GCN: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]] +; GCN: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]] + +; GCN-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]] +; GCN-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]] + +; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]] +define amdgpu_kernel void @test_fmin_f64_ieee_noflush([8 x i32], double %a, [8 x i32], double %b) #1 { + %val = call double @llvm.minnum.f64(double %a, double %b) #0 + store double %val, double addrspace(1)* 
undef, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_fmin_f64_ieee_flush: +; GCN: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]] +; GCN: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]] +; GFX678-DAG: v_mul_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], 1.0, [[A]] +; GFX678-DAG: v_mul_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], 1.0, [[B]] + +; GFX9-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]] +; GFX9-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]] + +; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]] +define amdgpu_kernel void @test_fmin_f64_ieee_flush([8 x i32], double %a, [8 x i32], double %b) #2 { %val = call double @llvm.minnum.f64(double %a, double %b) #0 store double %val, double addrspace(1)* undef, align 8 ret void } -; FUNC-LABEL: {{^}}test_fmin_f64_no_ieee: -; SI: ds_read_b64 [[VAL0:v\[[0-9]+:[0-9]+\]]] -; SI: ds_read_b64 [[VAL1:v\[[0-9]+:[0-9]+\]]] -; SI-NOT: [[VAL0]] -; SI-NOT: [[VAL1]] -; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VAL0]], [[VAL1]] -; SI-NOT: [[RESULT]] -; SI: ds_write_b64 v{{[0-9]+}}, [[RESULT]] +; GCN-LABEL: {{^}}test_fmin_f64_no_ieee: +; GCN: ds_read_b64 [[VAL0:v\[[0-9]+:[0-9]+\]]] +; GCN: ds_read_b64 [[VAL1:v\[[0-9]+:[0-9]+\]]] +; GCN-NOT: [[VAL0]] +; GCN-NOT: [[VAL1]] +; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VAL0]], [[VAL1]] +; GCN-NOT: [[RESULT]] +; GCN: ds_write_b64 v{{[0-9]+}}, [[RESULT]] define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind { %a = load volatile double, double addrspace(3)* undef %b = load volatile double, double addrspace(3)* undef @@ -35,58 +54,58 @@ define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind { ret void } -; FUNC-LABEL: {{^}}test_fmin_v2f64: -; SI: v_min_f64 -; SI: v_min_f64 +; GCN-LABEL: {{^}}test_fmin_v2f64: +; GCN: v_min_f64 +; GCN: v_min_f64 define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 store <2 x double> %val, <2 
x double> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}test_fmin_v4f64: -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 +; GCN-LABEL: {{^}}test_fmin_v4f64: +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 ret void } -; FUNC-LABEL: {{^}}test_fmin_v8f64: -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 +; GCN-LABEL: {{^}}test_fmin_v8f64: +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 ret void } -; FUNC-LABEL: {{^}}test_fmin_v16f64: -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 +; GCN-LABEL: {{^}}test_fmin_v16f64: +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 +; GCN: v_min_f64 define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, 
<16 x double> %b) #0 store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 @@ -94,3 +113,5 @@ define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <1 } attributes #0 = { nounwind readnone } +attributes #1 = { nounwind "denormal-fp-math"="ieee,ieee" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }