diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll index 609769fd51488..1328025f1c3c2 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll @@ -2,154 +2,63 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOW16,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST16,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOW16,ALL %s +; END. - -; ALL: 'add_i32' +; ALL-LABEL: 'add_i32' ; ALL: estimated cost of 1 for {{.*}} add i32 -define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { - %vec = load i32, i32 addrspace(1)* %vaddr - %add = add i32 %vec, %b - store i32 %add, i32 addrspace(1)* %out - ret void -} - -; ALL: 'add_v2i32' ; ALL: estimated cost of 2 for {{.*}} add <2 x i32> -define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 { - %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr - %add = add <2 x i32> %vec, %b - store <2 x i32> %add, <2 x i32> addrspace(1)* %out - ret void -} - -; ALL: 'add_v3i32' -; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. +;;; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening, +;;; and 3 when it is legal. 
; ALL: estimated cost of {{[34]}} for {{.*}} add <3 x i32> -define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 { - %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr - %add = add <3 x i32> %vec, %b - store <3 x i32> %add, <3 x i32> addrspace(1)* %out - ret void -} - -; ALL: 'add_v4i32' ; ALL: estimated cost of 4 for {{.*}} add <4 x i32> -define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 { - %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr - %add = add <4 x i32> %vec, %b - store <4 x i32> %add, <4 x i32> addrspace(1)* %out - ret void -} - -; ALL: 'add_v5i32' -; Allow for 8 when v3i32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. +;;; Allow for 8 when v5i32 is illegal and TargetLowering thinks it needs widening, +;;; and 5 when it is legal. ; ALL: estimated cost of {{[58]}} for {{.*}} add <5 x i32> -define amdgpu_kernel void @add_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 { - %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr - %add = add <5 x i32> %vec, %b - store <5 x i32> %add, <5 x i32> addrspace(1)* %out +define amdgpu_kernel void @add_i32() #0 { + %i32 = add i32 undef, undef + %v2i32 = add <2 x i32> undef, undef + %v3i32 = add <3 x i32> undef, undef + %v4i32 = add <4 x i32> undef, undef + %v5i32 = add <5 x i32> undef, undef ret void } -; ALL: 'add_i64' +; ALL-LABEL: 'add_i64' ; ALL: estimated cost of 2 for {{.*}} add i64 -define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 { - %vec = load i64, i64 addrspace(1)* %vaddr - %add = add i64 %vec, %b - store i64 %add, i64 addrspace(1)* %out - ret void -} - -; ALL: 'add_v2i64' ; ALL: estimated cost of 4 for {{.*}} add <2 x i64> -define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 { - %vec = load 
<2 x i64>, <2 x i64> addrspace(1)* %vaddr - %add = add <2 x i64> %vec, %b - store <2 x i64> %add, <2 x i64> addrspace(1)* %out - ret void -} - -; ALL: 'add_v3i64' ; ALL: estimated cost of 6 for {{.*}} add <3 x i64> -define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 { - %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr - %add = add <3 x i64> %vec, %b - store <3 x i64> %add, <3 x i64> addrspace(1)* %out - ret void -} - -; ALL: 'add_v4i64' ; ALL: estimated cost of 8 for {{.*}} add <4 x i64> -define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 { - %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr - %add = add <4 x i64> %vec, %b - store <4 x i64> %add, <4 x i64> addrspace(1)* %out - ret void -} - -; ALL: 'add_v16i64' ; ALL: estimated cost of 128 for {{.*}} add <16 x i64> -define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 { - %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr - %add = add <16 x i64> %vec, %b - store <16 x i64> %add, <16 x i64> addrspace(1)* %out +define amdgpu_kernel void @add_i64() #0 { + %i64 = add i64 undef, undef + %v2i64 = add <2 x i64> undef, undef + %v3i64 = add <3 x i64> undef, undef + %v4i64 = add <4 x i64> undef, undef + %v16i64 = add <16 x i64> undef, undef ret void } -; ALL: 'add_i16' +; ALL-LABEL: 'add_i16' ; ALL: estimated cost of 1 for {{.*}} add i16 -define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 { - %vec = load i16, i16 addrspace(1)* %vaddr - %add = add i16 %vec, %b - store i16 %add, i16 addrspace(1)* %out - ret void -} - -; ALL: 'add_v2i16' ; SLOW16: estimated cost of 2 for {{.*}} add <2 x i16> ; FAST16: estimated cost of 1 for {{.*}} add <2 x i16> -define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 { - %vec = load 
<2 x i16>, <2 x i16> addrspace(1)* %vaddr - %add = add <2 x i16> %vec, %b - store <2 x i16> %add, <2 x i16> addrspace(1)* %out +define amdgpu_kernel void @add_i16() #0 { + %i16 = add i16 undef, undef + %v2i16 = add <2 x i16> undef, undef ret void } -; ALL: 'sub_i32' +; ALL-LABEL: 'sub' ; ALL: estimated cost of 1 for {{.*}} sub i32 -define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { - %vec = load i32, i32 addrspace(1)* %vaddr - %sub = sub i32 %vec, %b - store i32 %sub, i32 addrspace(1)* %out - ret void -} - -; ALL: 'sub_i64' ; ALL: estimated cost of 2 for {{.*}} sub i64 -define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 { - %vec = load i64, i64 addrspace(1)* %vaddr - %sub = sub i64 %vec, %b - store i64 %sub, i64 addrspace(1)* %out - ret void -} -; ALL: 'sub_i16' ; ALL: estimated cost of 1 for {{.*}} sub i16 -define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 { - %vec = load i16, i16 addrspace(1)* %vaddr - %sub = sub i16 %vec, %b - store i16 %sub, i16 addrspace(1)* %out - ret void -} - -; ALL: 'sub_v2i16' ; SLOW16: estimated cost of 2 for {{.*}} sub <2 x i16> ; FAST16: estimated cost of 1 for {{.*}} sub <2 x i16> -define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %sub = sub <2 x i16> %vec, %b - store <2 x i16> %sub, <2 x i16> addrspace(1)* %out +define amdgpu_kernel void @sub() #0 { + %i32 = sub i32 undef, undef + %i64 = sub i64 undef, undef + %i16 = sub i16 undef, undef + %v2i16 = sub <2 x i16> undef, undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll b/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll index a87a965c6bfd0..8ca13eed2f43f 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll @@ -1,5 +1,6 
@@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s +; END. ; CHECK-LABEL: 'addrspacecast_global_to_flat' ; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8* diff --git a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll index 2dec5f350936d..63f7ab74e2006 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll @@ -2,88 +2,41 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FAST16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL,SLOW16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FAST16 %s +; END. 
-; ALL: 'or_i32' +; ALL-LABEL: 'or' ; ALL: estimated cost of 1 for {{.*}} or i32 -define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { - %vec = load i32, i32 addrspace(1)* %vaddr - %or = or i32 %vec, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; ALL: 'or_i64' ; ALL: estimated cost of 2 for {{.*}} or i64 -define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 { - %vec = load i64, i64 addrspace(1)* %vaddr - %or = or i64 %vec, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; ALL: 'or_v2i16' ; SLOW16: estimated cost of 2 for {{.*}} or <2 x i16> ; FAST16: estimated cost of 1 for {{.*}} or <2 x i16> -define amdgpu_kernel void @or_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %or = or <2 x i16> %vec, %b - store <2 x i16> %or, <2 x i16> addrspace(1)* %out +define amdgpu_kernel void @or() #0 { + %i32 = or i32 undef, undef + %i64 = or i64 undef, undef + %v2i16 = or <2 x i16> undef, undef ret void } -; ALL: 'xor_i32' +; ALL-LABEL: 'xor' ; ALL: estimated cost of 1 for {{.*}} xor i32 -define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { - %vec = load i32, i32 addrspace(1)* %vaddr - %or = xor i32 %vec, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; ALL: 'xor_i64' ; ALL: estimated cost of 2 for {{.*}} xor i64 -define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 { - %vec = load i64, i64 addrspace(1)* %vaddr - %or = xor i64 %vec, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; ALL: 'xor_v2i16' ; SLOW16: estimated cost of 2 for {{.*}} xor <2 x i16> ; FAST16: estimated cost of 1 for {{.*}} xor <2 x i16> -define amdgpu_kernel void @xor_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 { - %vec = load <2 x i16>, <2 x i16> 
addrspace(1)* %vaddr - %xor = xor <2 x i16> %vec, %b - store <2 x i16> %xor, <2 x i16> addrspace(1)* %out +define amdgpu_kernel void @xor() #0 { + %i32 = xor i32 undef, undef + %i64 = xor i64 undef, undef + %v2i16 = xor <2 x i16> undef, undef ret void } -; ALL: 'and_i32' +; ALL-LABEL: 'and' ; ALL: estimated cost of 1 for {{.*}} and i32 -define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { - %vec = load i32, i32 addrspace(1)* %vaddr - %or = and i32 %vec, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; ALL: 'and_i64' ; ALL: estimated cost of 2 for {{.*}} and i64 -define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 { - %vec = load i64, i64 addrspace(1)* %vaddr - %or = and i64 %vec, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; ALL: 'and_v2i16' ; SLOW16: estimated cost of 2 for {{.*}} and <2 x i16> ; FAST16: estimated cost of 1 for {{.*}} and <2 x i16> -define amdgpu_kernel void @and_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %and = and <2 x i16> %vec, %b - store <2 x i16> %and, <2 x i16> addrspace(1)* %out +define amdgpu_kernel void @and() #0 { + %i32 = and i32 undef, undef + %i64 = and i64 undef, undef + %v2i16 = and <2 x i16> undef, undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll b/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll index 88c4956f3fc0e..dbe55ff7bfc86 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll @@ -1,15 +1,16 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck --check-prefixes=ALL,SPEED %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck --check-prefixes=ALL,SIZE %s +; END. 
; ALL-LABEL: 'test_br_cost' -; SPEED: estimated cost of 7 for instruction: br i1 +; SPEED-NEXT: estimated cost of 7 for instruction: br i1 ; SPEED: estimated cost of 4 for instruction: br label -; SPEED: estimated cost of 1 for instruction: %phi = phi i32 [ -; SPEED: estimated cost of 10 for instruction: ret void -; SIZE: estimated cost of 5 for instruction: br i1 +; SPEED-NEXT: estimated cost of 1 for instruction: %phi = phi i32 [ +; SPEED-NEXT: estimated cost of 10 for instruction: ret void +; SIZE-NEXT: estimated cost of 5 for instruction: br i1 ; SIZE: estimated cost of 1 for instruction: br label -; SIZE: estimated cost of 0 for instruction: %phi = phi i32 [ -; SIZE: estimated cost of 1 for instruction: ret void +; SIZE-NEXT: estimated cost of 0 for instruction: %phi = phi i32 [ +; SIZE-NEXT: estimated cost of 1 for instruction: ret void define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { bb0: br i1 undef, label %bb1, label %bb2 @@ -26,8 +27,8 @@ bb2: } ; ALL-LABEL: 'test_switch_cost' -; SPEED: estimated cost of 24 for instruction: switch -; SIZE: estimated cost of 18 for instruction: switch +; SPEED-NEXT: estimated cost of 24 for instruction: switch +; SIZE-NEXT: estimated cost of 18 for instruction: switch define amdgpu_kernel void @test_switch_cost(i32 %a) #0 { entry: switch i32 %a, label %default [ diff --git a/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll b/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll index 67ce8ffba936e..1af8d862732c3 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll @@ -4,141 +4,55 @@ ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,GFX89 %s ; RUN: opt -cost-model -cost-kind=code-size 
-analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX89 %s - - -; GCN: 'extractelement_v2i32' -; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i32> -define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) { - %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr - %elt = extractelement <2 x i32> %vec, i32 1 - store i32 %elt, i32 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v2f32' -; GCN: estimated cost of 0 for {{.*}} extractelement <2 x float> -define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %elt = extractelement <2 x float> %vec, i32 1 - store float %elt, float addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v3i32' -; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i32> -define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) { - %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr - %elt = extractelement <3 x i32> %vec, i32 1 - store i32 %elt, i32 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v4i32' -; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i32> -define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) { - %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr - %elt = extractelement <4 x i32> %vec, i32 1 - store i32 %elt, i32 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v5i32' -; GCN: estimated cost of 0 for {{.*}} extractelement <5 x i32> -define amdgpu_kernel void @extractelement_v5i32(i32 addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr) { - %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr - %elt = extractelement <5 x i32> %vec, i32 1 - store i32 %elt, i32 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v8i32' -; GCN: estimated cost of 0 for {{.*}} 
extractelement <8 x i32> -define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) { - %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr - %elt = extractelement <8 x i32> %vec, i32 1 - store i32 %elt, i32 addrspace(1)* %out - ret void -} - -; FIXME: Should be non-0 -; GCN: 'extractelement_v8i32_dynindex' -; GCN: estimated cost of 2 for {{.*}} extractelement <8 x i32> -define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) { - %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr - %elt = extractelement <8 x i32> %vec, i32 %idx - store i32 %elt, i32 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v2i64' -; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i64> -define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) { - %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr - %elt = extractelement <2 x i64> %vec, i64 1 - store i64 %elt, i64 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v3i64' -; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i64> -define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) { - %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr - %elt = extractelement <3 x i64> %vec, i64 1 - store i64 %elt, i64 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v4i64' -; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i64> -define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) { - %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr - %elt = extractelement <4 x i64> %vec, i64 1 - store i64 %elt, i64 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v8i64' -; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i64> -define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) { - 
%vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr - %elt = extractelement <8 x i64> %vec, i64 1 - store i64 %elt, i64 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_v4i8' -; GCN: estimated cost of 1 for {{.*}} extractelement <4 x i8> -define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) { - %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr - %elt = extractelement <4 x i8> %vec, i8 1 - store i8 %elt, i8 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_0_v2i16': -; CI: estimated cost of 1 for {{.*}} extractelement <2 x i16> %vec, i16 0 -; GFX89: estimated cost of 0 for {{.*}} extractelement <2 x i16> -define amdgpu_kernel void @extractelement_0_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %elt = extractelement <2 x i16> %vec, i16 0 - store i16 %elt, i16 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_1_v2i16': -; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16> -define amdgpu_kernel void @extractelement_1_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %elt = extractelement <2 x i16> %vec, i16 1 - store i16 %elt, i16 addrspace(1)* %out - ret void -} - -; GCN: 'extractelement_var_v2i16' -; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16> -define amdgpu_kernel void @extractelement_var_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, i32 %idx) { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %elt = extractelement <2 x i16> %vec, i32 %idx - store i16 %elt, i16 addrspace(1)* %out +; END. 
+ +; GCN-LABEL: 'extractelement_32' +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x i32> +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x float> +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <3 x i32> +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <4 x i32> +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <5 x i32> +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <8 x i32> +; GCN-NEXT: estimated cost of 2 for {{.*}} extractelement <8 x i32> +define amdgpu_kernel void @extractelement_32(i32 %arg) { + %v2i32_1 = extractelement <2 x i32> undef, i32 1 + %v2f32_1 = extractelement <2 x float> undef, i32 1 + %v3i32_1 = extractelement <3 x i32> undef, i32 1 + %v4i32_1 = extractelement <4 x i32> undef, i32 1 + %v5i32_1 = extractelement <5 x i32> undef, i32 1 + %v8i32_1 = extractelement <8 x i32> undef, i32 1 + %v8i32_a = extractelement <8 x i32> undef, i32 %arg + ret void +} + +; GCN-LABEL: 'extractelement_64' +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x i64> +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <3 x i64> +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <4 x i64> +; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <8 x i64> +define amdgpu_kernel void @extractelement_64() { + %v2i64_1 = extractelement <2 x i64> undef, i64 1 + %v3i64_1 = extractelement <3 x i64> undef, i64 1 + %v4i64_1 = extractelement <4 x i64> undef, i64 1 + %v8i64_1 = extractelement <8 x i64> undef, i64 1 + ret void +} + +; GCN-LABEL: 'extractelement_8' +; GCN-NEXT: estimated cost of 1 for {{.*}} extractelement <4 x i8> +define amdgpu_kernel void @extractelement_8() { + %v4i8_1 = extractelement <4 x i8> undef, i8 1 + ret void +} + +; GCN-LABEL: 'extractelement_16' +; CI-NEXT: estimated cost of 1 for {{.*}} extractelement <2 x i16> undef, i16 0 +; GFX89-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x i16> +; GCN-NEXT: estimated cost of 1 for {{.*}} extractelement <2 
x i16> +; GCN-NEXT: estimated cost of 1 for {{.*}} extractelement <2 x i16> +define amdgpu_kernel void @extractelement_16(i32 %arg) { + %v2i16_0 = extractelement <2 x i16> undef, i16 0 + %v2i16_1 = extractelement <2 x i16> undef, i16 1 + %v2i16_a = extractelement <2 x i16> undef, i32 %arg ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll index de5381c2102ae..a616d455ce80c 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll @@ -1,93 +1,39 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s +; END. ; CHECK-LABEL: 'fabs_f32' ; CHECK: estimated cost of 0 for {{.*}} call float @llvm.fabs.f32 -define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 { - %vec = load float, float addrspace(1)* %vaddr - %fabs = call float @llvm.fabs.f32(float %vec) #1 - store float %fabs, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fabs_v2f32' ; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32 -define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1 - store <2 x float> %fabs, <2 x float> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fabs_v3f32' ; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32 -define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 { - %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr - %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1 - store <3 x float> %fabs, <3 x float> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fabs_v5f32' ; CHECK: estimated cost of 0 for 
{{.*}} call <5 x float> @llvm.fabs.v5f32 -define amdgpu_kernel void @fabs_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 { - %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr - %fabs = call <5 x float> @llvm.fabs.v5f32(<5 x float> %vec) #1 - store <5 x float> %fabs, <5 x float> addrspace(1)* %out +define amdgpu_kernel void @fabs_f32() #0 { + %f32 = call float @llvm.fabs.f32(float undef) #1 + %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #1 + %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #1 + %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #1 ret void } ; CHECK-LABEL: 'fabs_f64' ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64 -define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { - %vec = load double, double addrspace(1)* %vaddr - %fabs = call double @llvm.fabs.f64(double %vec) #1 - store double %fabs, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fabs_v2f64' ; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64 -define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 { - %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr - %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1 - store <2 x double> %fabs, <2 x double> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fabs_v3f64' ; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64 -define amdgpu_kernel void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 { - %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr - %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1 - store <3 x double> %fabs, <3 x double> addrspace(1)* %out +define amdgpu_kernel void @fabs_f64() #0 { + %f64 = call double @llvm.fabs.f64(double undef) #1 + %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #1 + %v3f64 
= call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #1 ret void } ; CHECK-LABEL: 'fabs_f16' ; CHECK: estimated cost of 0 for {{.*}} call half @llvm.fabs.f16 -define amdgpu_kernel void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 { - %vec = load half, half addrspace(1)* %vaddr - %fabs = call half @llvm.fabs.f16(half %vec) #1 - store half %fabs, half addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fabs_v2f16' ; CHECK: estimated cost of 0 for {{.*}} call <2 x half> @llvm.fabs.v2f16 -define amdgpu_kernel void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 { - %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr - %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vec) #1 - store <2 x half> %fabs, <2 x half> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fabs_v3f16' ; CHECK: estimated cost of 0 for {{.*}} call <3 x half> @llvm.fabs.v3f16 -define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 { - %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr - %fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %vec) #1 - store <3 x half> %fabs, <3 x half> addrspace(1)* %out +define amdgpu_kernel void @fabs_f16() #0 { + %f16 = call half @llvm.fabs.f16(half undef) #1 + %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #1 + %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #1 ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll index 8eb1a07691707..b79a09c2c31f3 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -3,47 +3,25 @@ ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | 
FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s +; END. ; ALL-LABEL: 'fadd_f32' ; ALL: estimated cost of 1 for {{.*}} fadd float -define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { - %vec = load float, float addrspace(1)* %vaddr - %add = fadd float %vec, %b - store float %add, float addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fadd_v2f32' ; NOPACKEDF32: estimated cost of 2 for {{.*}} fadd <2 x float> ; PACKEDF32: estimated cost of 1 for {{.*}} fadd <2 x float> -define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %add = fadd <2 x float> %vec, %b - store <2 x float> %add, <2 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fadd_v3f32' ; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, ; and 3 when it is legal. ; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fadd <3 x float> ; PACKEDF32: estimated cost of 2 for {{.*}} fadd <3 x float> -define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { - %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr - %add = fadd <3 x float> %vec, %b - store <3 x float> %add, <3 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fadd_v5f32' ; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, ; and 5 when it is legal. 
; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fadd <5 x float> ; PACKEDF32: estimated cost of 3 for {{.*}} fadd <5 x float> -define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { - %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr - %add = fadd <5 x float> %vec, %b - store <5 x float> %add, <5 x float> addrspace(1)* %out +define amdgpu_kernel void @fadd_f32() #0 { + %f32 = fadd float undef, undef + %v2f32 = fadd <2 x float> undef, undef + %v3f32 = fadd <3 x float> undef, undef + %v5f32 = fadd <5 x float> undef, undef ret void } @@ -52,73 +30,34 @@ define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float ; FASTF64: estimated cost of 2 for {{.*}} fadd double ; SLOWF64: estimated cost of 4 for {{.*}} fadd double ; SIZEALL: estimated cost of 2 for {{.*}} fadd double -define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { - %vec = load double, double addrspace(1)* %vaddr - %add = fadd double %vec, %b - store double %add, double addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fadd_v2f64' ; GFX90A-FASTF64: estimated cost of 2 for {{.*}} fadd <2 x double> ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double> ; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double> ; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double> -define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { - %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr - %add = fadd <2 x double> %vec, %b - store <2 x double> %add, <2 x double> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fadd_v3f64' ; GFX90A-FASTF64: estimated cost of 3 for {{.*}} fadd <3 x double> ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double> ; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double> ; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double> -define 
amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { - %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr - %add = fadd <3 x double> %vec, %b - store <3 x double> %add, <3 x double> addrspace(1)* %out +define amdgpu_kernel void @fadd_f64() #0 { + %f64 = fadd double undef, undef + %v2f64 = fadd <2 x double> undef, undef + %v3f64 = fadd <3 x double> undef, undef ret void } ; ALL-LABEL: 'fadd_f16' ; ALL: estimated cost of 1 for {{.*}} fadd half -define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { - %vec = load half, half addrspace(1)* %vaddr - %add = fadd half %vec, %b - store half %add, half addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fadd_v2f16' ; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half> ; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half> -define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { - %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr - %add = fadd <2 x half> %vec, %b - store <2 x half> %add, <2 x half> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fadd_v3f16' ; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half> ; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half> -define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 { - %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr - %add = fadd <3 x half> %vec, %b - store <3 x half> %add, <3 x half> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fadd_v4f16' ; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half> ; FASTF16: estimated cost of 2 for {{.*}} fadd <4 x half> -define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { - %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr - %add = fadd <4 x half> %vec, %b - store <4 x half> %add, 
<4 x half> addrspace(1)* %out +define amdgpu_kernel void @fadd_f16() #0 { + %f16 = fadd half undef, undef + %v2f16 = fadd <2 x half> undef, undef + %v3f16 = fadd <3 x half> undef, undef + %v4f16 = fadd <4 x half> undef, undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll index 883db92932a8f..d4836a9d69049 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -9,84 +9,39 @@ ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZEF16 %s +; END. ; ALL-LABEL: 'fdiv_f32_ieee' ; THRPTALL: estimated cost of 14 for {{.*}} fdiv float +; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float> +; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float> +; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float> ; SIZEALL: estimated cost of 12 for {{.*}} fdiv float -define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { - %vec = load float, float addrspace(1)* %vaddr - %add = fdiv float %vec, %b - store float %add, float addrspace(1)* %out +; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float> +; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float> +define amdgpu_kernel void @fdiv_f32_ieee() #0 { + %f32 = fdiv float undef, undef + %v2f32 = fdiv <2 x float> undef, undef + %v3f32 = fdiv <3 x float> undef, undef + %v5f32 = fdiv <5 x float> undef, undef ret void } ; ALL-LABEL: 'fdiv_f32_ftzdaz' ; THRPTALL: estimated cost 
of 16 for {{.*}} fdiv float ; SIZEALL: estimated cost of 14 for {{.*}} fdiv float -define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #1 { - %vec = load float, float addrspace(1)* %vaddr - %add = fdiv float %vec, %b - store float %add, float addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v2f32_ieee' -; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float> -; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float> -define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %add = fdiv <2 x float> %vec, %b - store <2 x float> %add, <2 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v2f32_ftzdaz' ; THRPTALL: estimated cost of 32 for {{.*}} fdiv <2 x float> ; SIZEALL: estimated cost of 28 for {{.*}} fdiv <2 x float> -define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #1 { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %add = fdiv <2 x float> %vec, %b - store <2 x float> %add, <2 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v3f32_ieee' -; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float> -; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float> -define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { - %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr - %add = fdiv <3 x float> %vec, %b - store <3 x float> %add, <3 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v3f32_ftzdaz' ; THRPTALL: estimated cost of 48 for {{.*}} fdiv <3 x float> ; SIZEALL: estimated cost of 42 for {{.*}} fdiv <3 x float> -define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #1 { - %vec = load 
<3 x float>, <3 x float> addrspace(1)* %vaddr - %add = fdiv <3 x float> %vec, %b - store <3 x float> %add, <3 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v5f32_ieee' -; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float> -; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float> -define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { - %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr - %add = fdiv <5 x float> %vec, %b - store <5 x float> %add, <5 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v5f32_ftzdaz' ; THRPTALL: estimated cost of 80 for {{.*}} fdiv <5 x float> ; SIZEALL: estimated cost of 70 for {{.*}} fdiv <5 x float> -define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #1 { - %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr - %add = fdiv <5 x float> %vec, %b - store <5 x float> %add, <5 x float> addrspace(1)* %out +define amdgpu_kernel void @fdiv_f32_ftzdaz() #1 { + %f32 = fdiv float undef, undef + %v2f32 = fdiv <2 x float> undef, undef + %v3f32 = fdiv <3 x float> undef, undef + %v5f32 = fdiv <5 x float> undef, undef ret void } @@ -97,208 +52,107 @@ define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 ; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double ; SIZECI: estimated cost of 22 for {{.*}} fdiv double ; SIZESI: estimated cost of 25 for {{.*}} fdiv double -define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { - %vec = load double, double addrspace(1)* %vaddr - %add = fdiv double %vec, %b - store double %add, double addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v2f64' ; CIFASTF64: estimated cost of 48 for {{.*}} fdiv <2 x double> ; CISLOWF64: estimated cost of 76 for {{.*}} fdiv <2 x double> ; SIFASTF64: estimated cost of 54 for {{.*}} fdiv <2 x 
double> ; SISLOWF64: estimated cost of 82 for {{.*}} fdiv <2 x double> ; SIZECI: estimated cost of 44 for {{.*}} fdiv <2 x double> ; SIZESI: estimated cost of 50 for {{.*}} fdiv <2 x double> -define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { - %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr - %add = fdiv <2 x double> %vec, %b - store <2 x double> %add, <2 x double> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v3f64' ; CIFASTF64: estimated cost of 72 for {{.*}} fdiv <3 x double> ; CISLOWF64: estimated cost of 114 for {{.*}} fdiv <3 x double> ; SIFASTF64: estimated cost of 81 for {{.*}} fdiv <3 x double> ; SISLOWF64: estimated cost of 123 for {{.*}} fdiv <3 x double> ; SIZECI: estimated cost of 66 for {{.*}} fdiv <3 x double> ; SIZESI: estimated cost of 75 for {{.*}} fdiv <3 x double> -define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { - %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr - %add = fdiv <3 x double> %vec, %b - store <3 x double> %add, <3 x double> addrspace(1)* %out +define amdgpu_kernel void @fdiv_f64() #0 { + %f64 = fdiv double undef, undef + %v2f64 = fdiv <2 x double> undef, undef + %v3f64 = fdiv <3 x double> undef, undef ret void } -; ALL-LABEL: 'fdiv_f16_f32_ieee' +; ALL-LABEL: 'fdiv_f16_f32ieee' ; NOFP16: estimated cost of 14 for {{.*}} fdiv half ; FP16: estimated cost of 12 for {{.*}} fdiv half ; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half ; SIZEF16: estimated cost of 8 for {{.*}} fdiv half -define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { - %vec = load half, half addrspace(1)* %vaddr - %add = fdiv half %vec, %b - store half %add, half addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_f16_f32_ftzdaz' -; NOFP16: estimated cost of 16 for {{.*}} fdiv half -; FP16: estimated cost of 12 for 
{{.*}} fdiv half -; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half -; SIZEF16: estimated cost of 8 for {{.*}} fdiv half -define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #1 { - %vec = load half, half addrspace(1)* %vaddr - %add = fdiv half %vec, %b - store half %add, half addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v2f16_f32_ieee' ; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half> ; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half> ; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half> ; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half> -define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { - %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr - %add = fdiv <2 x half> %vec, %b - store <2 x half> %add, <2 x half> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v2f16_f32_ftzdaz' -; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half> -; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half> -; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half> -; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half> -define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #1 { - %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr - %add = fdiv <2 x half> %vec, %b - store <2 x half> %add, <2 x half> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fdiv_v4f16_f32_ieee' ; NOFP16: estimated cost of 56 for {{.*}} fdiv <4 x half> ; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half> ; SIZENOF16: estimated cost of 48 for {{.*}} fdiv <4 x half> ; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half> -define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { - %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr - %add = fdiv <4 x half> 
%vec, %b - store <4 x half> %add, <4 x half> addrspace(1)* %out +define amdgpu_kernel void @fdiv_f16_f32ieee() #0 { + %f16 = fdiv half undef, undef + %v2f16 = fdiv <2 x half> undef, undef + %v4f16 = fdiv <4 x half> undef, undef ret void } -; ALL-LABEL: 'fdiv_v4f16_f32_ftzdaz' +; ALL-LABEL: 'fdiv_f16_f32ftzdaz' +; NOFP16: estimated cost of 16 for {{.*}} fdiv half +; FP16: estimated cost of 12 for {{.*}} fdiv half +; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half +; SIZEF16: estimated cost of 8 for {{.*}} fdiv half +; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half> +; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half> +; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half> +; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half> ; NOFP16: estimated cost of 64 for {{.*}} fdiv <4 x half> ; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half> ; SIZENOF16: estimated cost of 56 for {{.*}} fdiv <4 x half> ; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half> -define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #1 { - %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr - %add = fdiv <4 x half> %vec, %b - store <4 x half> %add, <4 x half> addrspace(1)* %out +define amdgpu_kernel void @fdiv_f16_f32ftzdaz() #1 { + %f16 = fdiv half undef, undef + %v2f16 = fdiv <2 x half> undef, undef + %v4f16 = fdiv <4 x half> undef, undef ret void } -; ALL-LABEL: 'rcp_f32_ieee' +; ALL-LABEL: 'rcp_ieee' ; THRPTALL: estimated cost of 14 for {{.*}} fdiv float ; SIZEALL: estimated cost of 12 for {{.*}} fdiv float -define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 { - %vec = load float, float addrspace(1)* %vaddr - %add = fdiv float 1.0, %vec - store float %add, float addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'rcp_f32_ftzdaz' -; THRPTALL: estimated cost of 4 for {{.*}} fdiv float -; SIZEALL: estimated cost of 2 for {{.*}} fdiv float 
-define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr) #1 { - %vec = load float, float addrspace(1)* %vaddr - %add = fdiv float 1.0, %vec - store float %add, float addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'rcp_f16_f32_ieee' ; NOFP16: estimated cost of 14 for {{.*}} fdiv half ; FP16: estimated cost of 4 for {{.*}} fdiv half ; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half ; SIZEF16: estimated cost of 2 for {{.*}} fdiv half -define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 { - %vec = load half, half addrspace(1)* %vaddr - %add = fdiv half 1.0, %vec - store half %add, half addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'rcp_f16_f32_ftzdaz' -; THRPTALL: estimated cost of 4 for {{.*}} fdiv half -; SIZEALL: estimated cost of 2 for {{.*}} fdiv half -define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr) #1 { - %vec = load half, half addrspace(1)* %vaddr - %add = fdiv half 1.0, %vec - store half %add, half addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'rcp_f64' ; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double ; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double ; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double ; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double ; SIZECI: estimated cost of 22 for {{.*}} fdiv double ; SIZESI: estimated cost of 25 for {{.*}} fdiv double -define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { - %vec = load double, double addrspace(1)* %vaddr - %add = fdiv double 1.0, %vec - store double %add, double addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'rcp_v2f32_ieee' ; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float> ; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float> -define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 { - %vec = load <2 x float>, <2 x 
float> addrspace(1)* %vaddr - %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec - store <2 x float> %add, <2 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'rcp_v2f32_ftzdaz' -; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float> -; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float> -define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #1 { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec - store <2 x float> %add, <2 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'rcp_v2f16_f32_ieee' ; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half> ; FP16: estimated cost of 8 for {{.*}} fdiv <2 x half> ; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half> ; SIZEF16: estimated cost of 4 for {{.*}} fdiv <2 x half> -define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 { - %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr - %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec - store <2 x half> %add, <2 x half> addrspace(1)* %out +define amdgpu_kernel void @rcp_ieee() #0 { + %f32 = fdiv float 1.0, undef + %f16 = fdiv half 1.0, undef + %f64 = fdiv double 1.0, undef + %v2f32 = fdiv <2 x float> <float 1.0, float 1.0>, undef + %v2f16 = fdiv <2 x half> <half 1.0, half 1.0>, undef ret void } -; ALL-LABEL: 'rcp_v2f16_f32_ftzdaz' +; ALL-LABEL: 'rcp_ftzdaz' +; THRPTALL: estimated cost of 4 for {{.*}} fdiv float +; SIZEALL: estimated cost of 2 for {{.*}} fdiv float +; THRPTALL: estimated cost of 4 for {{.*}} fdiv half +; SIZEALL: estimated cost of 2 for {{.*}} fdiv half +; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float> +; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float> ; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x half> ; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x half> -define amdgpu_kernel void @rcp_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #1 { - %vec = load <2 x half>, <2 x half>
addrspace(1)* %vaddr - %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec - store <2 x half> %add, <2 x half> addrspace(1)* %out +define amdgpu_kernel void @rcp_ftzdaz() #1 { + %f32 = fdiv float 1.0, undef + %f16 = fdiv half 1.0, undef + %v2f32 = fdiv <2 x float> <float 1.0, float 1.0>, undef + %v2f16 = fdiv <2 x half> <half 1.0, half 1.0>, undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll index c90ca1412effa..1758663ffeff5 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -3,48 +3,26 @@ ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s +; END.
; ALL-LABEL: 'fma_f32' ; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32 ; FASTF32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32 ; SIZEALL: estimated cost of 2 for {{.*}} call float @llvm.fma.f32 -define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 { - %vec = load float, float addrspace(1)* %vaddr - %fma = call float @llvm.fma.f32(float %vec, float %vec, float %vec) #1 - store float %fma, float addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fma_v2f32' ; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32 ; PACKEDF32: estimated cost of 2 for {{.*}} call <2 x float> @llvm.fma.v2f32 ; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32 -define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %vec, <2 x float> %vec, <2 x float> %vec) #1 - store <2 x float> %fma, <2 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fma_v3f32' ; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32 ; PACKEDF32: estimated cost of 4 for {{.*}} call <3 x float> @llvm.fma.v3f32 ; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 -define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 { - %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr - %fma = call <3 x float> @llvm.fma.v3f32(<3 x float> %vec, <3 x float> %vec, <3 x float> %vec) #1 - store <3 x float> %fma, <3 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fma_v5f32' ; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32 ; PACKEDF32: estimated cost of 6 for {{.*}} call <5 x float> @llvm.fma.v5f32 ; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 -define amdgpu_kernel void @fma_v5f32(<5 x float> 
addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 { - %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr - %fma = call <5 x float> @llvm.fma.v5f32(<5 x float> %vec, <5 x float> %vec, <5 x float> %vec) #1 - store <5 x float> %fma, <5 x float> addrspace(1)* %out +define amdgpu_kernel void @fma_f32() #0 { + %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #1 + %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #1 + %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #1 + %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #1 ret void } @@ -53,33 +31,17 @@ define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> ; GFX90A-FASTF64: estimated cost of 1 for {{.*}} call double @llvm.fma.f64 ; FASTF64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 ; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 -define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { - %vec = load double, double addrspace(1)* %vaddr - %fma = call double @llvm.fma.f64(double %vec, double %vec, double %vec) #1 - store double %fma, double addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fma_v2f64' ; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64 ; GFX90A-FASTF64: estimated cost of 2 for {{.*}} call <2 x double> @llvm.fma.v2f64 ; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 ; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 -define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 { - %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr - %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %vec, <2 x double> %vec, <2 x double> %vec) #1 - store <2 x double> %fma, <2 x double> addrspace(1)* %out - ret void 
-} - -; ALL-LABEL: 'fma_v3f64' ; SLOWF64: estimated cost of 12 for {{.*}} call <3 x double> @llvm.fma.v3f64 ; FASTF64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 ; SIZEALL: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64 -define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 { - %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr - %fma = call <3 x double> @llvm.fma.v3f64(<3 x double> %vec, <3 x double> %vec, <3 x double> %vec) #1 - store <3 x double> %fma, <3 x double> addrspace(1)* %out +define amdgpu_kernel void @fma_f64() #0 { + %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #1 + %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #1 + %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #1 ret void } @@ -87,34 +49,18 @@ define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x doubl ; SLOWF16: estimated cost of 4 for {{.*}} call half @llvm.fma.f16 ; FASTF16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 ; SIZEALL: estimated cost of 2 for {{.*}} call half @llvm.fma.f16 -define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 { - %vec = load half, half addrspace(1)* %vaddr - %fma = call half @llvm.fma.f16(half %vec, half %vec, half %vec) #1 - store half %fma, half addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fma_v2f16' ; SLOWF16: estimated cost of 8 for {{.*}} call <2 x half> @llvm.fma.v2f16 ; FASTF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 ; SIZEF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16 ; SIZENOF16: estimated cost of 4 for {{.*}} call <2 x half> @llvm.fma.v2f16 -define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 { - %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr 
- %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %vec, <2 x half> %vec, <2 x half> %vec) #1 - store <2 x half> %fma, <2 x half> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fma_v3f16' ; SLOWF16: estimated cost of 16 for {{.*}} call <3 x half> @llvm.fma.v3f16 ; FASTF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 ; SIZEF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16 ; SIZENOF16: estimated cost of 8 for {{.*}} call <3 x half> @llvm.fma.v3f16 -define amdgpu_kernel void @fma_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 { - %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr - %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %vec, <3 x half> %vec, <3 x half> %vec) #1 - store <3 x half> %fma, <3 x half> addrspace(1)* %out +define amdgpu_kernel void @fma_f16() #0 { + %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #1 + %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #1 + %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #1 ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll index 929a51229e5c1..75c5e76be7411 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -3,47 +3,25 @@ ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s +; END. 
; ALL-LABEL: 'fmul_f32' ; ALL: estimated cost of 1 for {{.*}} fmul float -define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { - %vec = load float, float addrspace(1)* %vaddr - %add = fmul float %vec, %b - store float %add, float addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fmul_v2f32' ; NOPACKEDF32: estimated cost of 2 for {{.*}} fmul <2 x float> ; PACKEDF32: estimated cost of 1 for {{.*}} fmul <2 x float> -define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %add = fmul <2 x float> %vec, %b - store <2 x float> %add, <2 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fmul_v3f32' -; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, -; and 3 when it is legal. +;;; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, +;;; and 3 when it is legal. ; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fmul <3 x float> ; PACKEDF32: estimated cost of 2 for {{.*}} fmul <3 x float> -define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { - %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr - %add = fmul <3 x float> %vec, %b - store <3 x float> %add, <3 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fmul_v5f32' -; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, -; and 5 when it is legal. +;;; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, +;;; and 5 when it is legal. 
; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fmul <5 x float> ; PACKEDF32: estimated cost of 3 for {{.*}} fmul <5 x float> -define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { - %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr - %add = fmul <5 x float> %vec, %b - store <5 x float> %add, <5 x float> addrspace(1)* %out +define amdgpu_kernel void @fmul_f32() #0 { + %f32 = fmul float undef, undef + %v2f32 = fmul <2 x float> undef, undef + %v3f32 = fmul <3 x float> undef, undef + %v5f32 = fmul <5 x float> undef, undef ret void } @@ -52,71 +30,32 @@ define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float ; FASTF64: estimated cost of 2 for {{.*}} fmul double ; SLOWF64: estimated cost of 4 for {{.*}} fmul double ; SIZEALL: estimated cost of 2 for {{.*}} fmul double -define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { - %vec = load double, double addrspace(1)* %vaddr - %add = fmul double %vec, %b - store double %add, double addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fmul_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double> ; SLOWF64: estimated cost of 8 for {{.*}} fmul <2 x double> ; SIZEALL: estimated cost of 4 for {{.*}} fmul <2 x double> -define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { - %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr - %add = fmul <2 x double> %vec, %b - store <2 x double> %add, <2 x double> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fmul_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double> ; SLOWF64: estimated cost of 12 for {{.*}} fmul <3 x double> ; SIZEALL: estimated cost of 6 for {{.*}} fmul <3 x double> -define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { - %vec = load <3 x 
double>, <3 x double> addrspace(1)* %vaddr - %add = fmul <3 x double> %vec, %b - store <3 x double> %add, <3 x double> addrspace(1)* %out +define amdgpu_kernel void @fmul_f64() #0 { + %f64 = fmul double undef, undef + %v2f64 = fmul <2 x double> undef, undef + %v3f64 = fmul <3 x double> undef, undef ret void } ; ALL-LABEL: 'fmul_f16' ; ALL: estimated cost of 1 for {{.*}} fmul half -define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { - %vec = load half, half addrspace(1)* %vaddr - %add = fmul half %vec, %b - store half %add, half addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fmul_v2f16' ; SLOWF16: estimated cost of 2 for {{.*}} fmul <2 x half> ; FASTF16: estimated cost of 1 for {{.*}} fmul <2 x half> -define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { - %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr - %add = fmul <2 x half> %vec, %b - store <2 x half> %add, <2 x half> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fmul_v3f16' ; SLOWF16: estimated cost of 4 for {{.*}} fmul <3 x half> ; FASTF16: estimated cost of 2 for {{.*}} fmul <3 x half> -define amdgpu_kernel void @fmul_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 { - %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr - %add = fmul <3 x half> %vec, %b - store <3 x half> %add, <3 x half> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fmul_v4f16' ; SLOWF16: estimated cost of 4 for {{.*}} fmul <4 x half> ; FASTF16: estimated cost of 2 for {{.*}} fmul <4 x half> -define amdgpu_kernel void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { - %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr - %add = fmul <4 x half> %vec, %b - store <4 x half> %add, <4 x half> addrspace(1)* %out +define amdgpu_kernel void @fmul_f16() #0 { + %f16 = fmul half undef, undef + %v2f16 = fmul <2 x half> undef, 
undef + %v3f16 = fmul <3 x half> undef, undef + %v4f16 = fmul <4 x half> undef, undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll index 462a363bebfdc..0038f5b9fa3bd 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll @@ -1,102 +1,38 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s +; END. ; CHECK-LABEL: 'fneg_f32' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg float -define amdgpu_kernel void @fneg_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) { - %vec = load float, float addrspace(1)* %vaddr - %fadd = fadd float %vec, undef - %fneg = fneg float %fadd - store float %fneg, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fneg_v2f32' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg <2 x float> -define amdgpu_kernel void @fneg_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %fadd = fadd <2 x float> %vec, undef - %fneg = fneg <2 x float> %fadd - store <2 x float> %fneg, <2 x float> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fneg_v3f32' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg <3 x float> -define amdgpu_kernel void @fneg_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) { - %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr - %fadd = fadd <3 x float> %vec, undef - %fneg = fneg <3 x float> %fadd - store <3 x float> %fneg, <3 x float> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fneg_v5f32' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg <5 x float> -define amdgpu_kernel void @fneg_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) { - %vec = load <5 x float>, <5 x float> addrspace(1)* 
%vaddr - %fadd = fadd <5 x float> %vec, undef - %fneg = fneg <5 x float> %fadd - store <5 x float> %fneg, <5 x float> addrspace(1)* %out +; CHECK: estimated cost of 0 for {{.*}} fneg float +; CHECK: estimated cost of 0 for {{.*}} fneg <2 x float> +; CHECK: estimated cost of 0 for {{.*}} fneg <3 x float> +; CHECK: estimated cost of 0 for {{.*}} fneg <5 x float> +define amdgpu_kernel void @fneg_f32() { + %f32 = fneg float undef + %v2f32 = fneg <2 x float> undef + %v3f32 = fneg <3 x float> undef + %v5f32 = fneg <5 x float> undef ret void } ; CHECK-LABEL: 'fneg_f64' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg double -define amdgpu_kernel void @fneg_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) { - %vec = load double, double addrspace(1)* %vaddr - %fadd = fadd double %vec, undef - %fneg = fneg double %fadd - store double %fneg, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fneg_v2f64' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg <2 x double> -define amdgpu_kernel void @fneg_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) { - %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr - %fadd = fadd <2 x double> %vec, undef - %fneg = fneg <2 x double> %fadd - store <2 x double> %fneg, <2 x double> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fneg_v3f64' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg <3 x double> -define amdgpu_kernel void @fneg_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) { - %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr - %fadd = fadd <3 x double> %vec, undef - %fneg = fneg <3 x double> %fadd - store <3 x double> %fneg, <3 x double> addrspace(1)* %out +; CHECK: estimated cost of 0 for {{.*}} fneg double +; CHECK: estimated cost of 0 for {{.*}} fneg <2 x double> +; CHECK: estimated cost of 0 for {{.*}} fneg <3 x double> +define amdgpu_kernel void @fneg_f64() { + %f64 = fneg double undef + %v2f64 = fneg <2 x 
double> undef + %v3f64 = fneg <3 x double> undef ret void } ; CHECK-LABEL: 'fneg_f16' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg half -define amdgpu_kernel void @fneg_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) { - %vec = load half, half addrspace(1)* %vaddr - %fadd = fadd half %vec, undef - %fneg = fneg half %fadd - store half %fneg, half addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fneg_v2f16' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg <2 x half> -define amdgpu_kernel void @fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) { - %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr - %fadd = fadd <2 x half> %vec, undef - %fneg = fneg <2 x half> %fadd - store <2 x half> %fneg, <2 x half> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 'fneg_v3f16' -; CHECK: estimated cost of 0 for instruction: %fneg = fneg <3 x half> -define amdgpu_kernel void @fneg_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) { - %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr - %fadd = fadd <3 x half> %vec, undef - %fneg = fneg <3 x half> %fadd - store <3 x half> %fneg, <3 x half> addrspace(1)* %out +; CHECK: estimated cost of 0 for {{.*}} fneg half +; CHECK: estimated cost of 0 for {{.*}} fneg <2 x half> +; CHECK: estimated cost of 0 for {{.*}} fneg <3 x half> +define amdgpu_kernel void @fneg_f16() { + %f16 = fneg half undef + %v2f16 = fneg <2 x half> undef + %v3f16 = fneg <3 x half> undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll index 287bba8f83b12..27d5a000ef5f8 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -2,40 +2,18 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s ; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa 
-mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,FASTF16,ALL %s ; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,SLOWF16,ALL %s +; END. ; ALL-LABEL: 'fsub_f32' ; ALL: estimated cost of 1 for {{.*}} fsub float -define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 { - %vec = load float, float addrspace(1)* %vaddr - %add = fsub float %vec, %b - store float %add, float addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fsub_v2f32' ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float> -define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { - %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr - %add = fsub <2 x float> %vec, %b - store <2 x float> %add, <2 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fsub_v3f32' ; ALL: estimated cost of 3 for {{.*}} fsub <3 x float> -define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { - %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr - %add = fsub <3 x float> %vec, %b - store <3 x float> %add, <3 x float> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fsub_v5f32' ; ALL: estimated cost of 5 for {{.*}} fsub <5 x float> -define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { - %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr - %add = fsub <5 x float> %vec, %b - store <5 x float> %add, <5 x float> addrspace(1)* %out +define amdgpu_kernel void @fsub_f32() #0 { + %f32 = fsub float undef, undef + %v2f32 = fsub <2 x float> undef, undef + %v3f32 = fsub <3 x float> undef, undef + %v5f32 = fsub <5 x float> undef, undef ret void } @@ -43,70 +21,31 @@ define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 
x float ; FASTF64: estimated cost of 2 for {{.*}} fsub double ; SLOWF64: estimated cost of 4 for {{.*}} fsub double ; SIZEALL: estimated cost of 2 for {{.*}} fsub double -define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 { - %vec = load double, double addrspace(1)* %vaddr - %add = fsub double %vec, %b - store double %add, double addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fsub_v2f64' ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double> ; SLOWF64: estimated cost of 8 for {{.*}} fsub <2 x double> ; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double> -define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 { - %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr - %add = fsub <2 x double> %vec, %b - store <2 x double> %add, <2 x double> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fsub_v3f64' ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double> ; SLOWF64: estimated cost of 12 for {{.*}} fsub <3 x double> ; SIZEALL: estimated cost of 6 for {{.*}} fsub <3 x double> -define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 { - %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr - %add = fsub <3 x double> %vec, %b - store <3 x double> %add, <3 x double> addrspace(1)* %out +define amdgpu_kernel void @fsub_f64() #0 { + %f64 = fsub double undef, undef + %v2f64 = fsub <2 x double> undef, undef + %v3f64 = fsub <3 x double> undef, undef ret void } ; ALL-LABEL: 'fsub_f16' ; ALL: estimated cost of 1 for {{.*}} fsub half -define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 { - %vec = load half, half addrspace(1)* %vaddr - %add = fsub half %vec, %b - store half %add, half addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fsub_v2f16' ; SLOWF16: estimated cost of 2 for {{.*}} fsub <2 x half> ; 
FASTF16: estimated cost of 1 for {{.*}} fsub <2 x half> -define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 { - %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr - %add = fsub <2 x half> %vec, %b - store <2 x half> %add, <2 x half> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fsub_v3f16' ; SLOWF16: estimated cost of 4 for {{.*}} fsub <3 x half> ; FASTF16: estimated cost of 2 for {{.*}} fsub <3 x half> -define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 { - %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr - %add = fsub <3 x half> %vec, %b - store <3 x half> %add, <3 x half> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'fsub_v4f16' ; SLOWF16: estimated cost of 4 for {{.*}} fsub <4 x half> ; FASTF16: estimated cost of 2 for {{.*}} fsub <4 x half> -define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 { - %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr - %add = fsub <4 x half> %vec, %b - store <4 x half> %add, <4 x half> addrspace(1)* %out +define amdgpu_kernel void @fsub_f16() #0 { + %f16 = fsub half undef, undef + %v2f16 = fsub <2 x half> undef, undef + %v3f16 = fsub <3 x half> undef, undef + %v4f16 = fsub <4 x half> undef, undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll index 5fbd7835351e7..52b745bbad3b3 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll @@ -6,167 +6,109 @@ ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,SZNOCONTRACT,SIZEALL,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa 
-mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,SIZEALL,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,SZNOCONTRACT,SIZEALL,ALL %s +; END. target triple = "amdgcn--" ; ALL-LABEL: 'fmul_fadd_f32': -; FUSED: estimated cost of 0 for instruction: %mul = fmul float -; SLOW: estimated cost of 1 for instruction: %mul = fmul float -; GFX1030: estimated cost of 1 for instruction: %mul = fmul float -; ALL: estimated cost of 1 for instruction: %add = fadd float -define float @fmul_fadd_f32(float %r0, float %r1, float %r2) #0 { - %mul = fmul float %r0, %r1 - %add = fadd float %mul, %r2 - ret float %add -} - -; ALL-LABEL: 'fmul_fadd_contract_f32': -; ALL: estimated cost of 0 for instruction: %mul = fmul contract float -; ALL: estimated cost of 1 for instruction: %add = fadd contract float -define float @fmul_fadd_contract_f32(float %r0, float %r1, float %r2) #0 { - %mul = fmul contract float %r0, %r1 - %add = fadd contract float %mul, %r2 - ret float %add -} - -; ALL-LABEL: 'fmul_fadd_v2f32': -; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float> -; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float> -; GFX1030: estimated cost of 2 for instruction: %mul = fmul <2 x float> -; ALL: estimated cost of 2 for instruction: %add = fadd <2 x float> -define <2 x float> @fmul_fadd_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 { - %mul = fmul <2 x float> %r0, %r1 - %add = fadd <2 x float> %mul, %r2 - ret <2 x float> %add -} - -; ALL-LABEL: 'fmul_fsub_f32': -; FUSED: estimated cost of 0 for instruction: %mul = fmul float -; SLOW: estimated cost of 1 for instruction: %mul = fmul float -; GFX1030: estimated cost of 1 for instruction: %mul = fmul float -; ALL: estimated cost of 1 for 
instruction: %sub = fsub float -define float @fmul_fsub_f32(float %r0, float %r1, float %r2) #0 { - %mul = fmul float %r0, %r1 - %sub = fsub float %mul, %r2 - ret float %sub -} - -; ALL-LABEL: 'fmul_fsub_v2f32': -; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float> -; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float> -; GFX1030: estimated cost of 2 for instruction: %mul = fmul <2 x float> -; ALL: estimated cost of 2 for instruction: %sub = fsub <2 x float> -define <2 x float> @fmul_fsub_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 { - %mul = fmul <2 x float> %r0, %r1 - %sub = fsub <2 x float> %mul, %r2 - ret <2 x float> %sub +; FUSED: estimated cost of 0 for {{.*}} fmul float +; SLOW: estimated cost of 1 for {{.*}} fmul float +; GFX1030: estimated cost of 1 for {{.*}} fmul float +; ALL: estimated cost of 1 for {{.*}} fadd float +; ALL: estimated cost of 0 for {{.*}} fmul contract float +; ALL: estimated cost of 1 for {{.*}} fadd contract float +; FUSED: estimated cost of 0 for {{.*}} fmul <2 x float> +; SLOW: estimated cost of 2 for {{.*}} fmul <2 x float> +; GFX1030: estimated cost of 2 for {{.*}} fmul <2 x float> +; ALL: estimated cost of 2 for {{.*}} fadd <2 x float> +; FUSED: estimated cost of 0 for {{.*}} fmul float +; SLOW: estimated cost of 1 for {{.*}} fmul float +; GFX1030: estimated cost of 1 for {{.*}} fmul float +; ALL: estimated cost of 1 for {{.*}} fsub float +; FUSED: estimated cost of 0 for {{.*}} fmul <2 x float> +; SLOW: estimated cost of 2 for {{.*}} fmul <2 x float> +; GFX1030: estimated cost of 2 for {{.*}} fmul <2 x float> +; ALL: estimated cost of 2 for {{.*}} fsub <2 x float> +define void @fmul_fadd_f32() #0 { + %f32 = fmul float undef, undef + %f32add = fadd float %f32, undef + %f32c = fmul contract float undef, undef + %f32cadd = fadd contract float %f32c, undef + %v2f32 = fmul <2 x float> undef, undef + %v2f32add = fadd <2 x float> %v2f32, undef + %f32_2 = fmul float undef, undef + %f32sub 
= fsub float %f32_2, undef + %v2f32_2 = fmul <2 x float> undef, undef + %v2f32sub = fsub <2 x float> %v2f32_2, undef + ret void } ; ALL-LABEL: 'fmul_fadd_f16': -; FUSED: estimated cost of 0 for instruction: %mul = fmul half -; SLOW: estimated cost of 1 for instruction: %mul = fmul half -; ALL: estimated cost of 1 for instruction: %add = fadd half -define half @fmul_fadd_f16(half %r0, half %r1, half %r2) #0 { - %mul = fmul half %r0, %r1 - %add = fadd half %mul, %r2 - ret half %add -} - -; ALL-LABEL: 'fmul_fadd_contract_f16': -; ALL: estimated cost of 0 for instruction: %mul = fmul contract half -; ALL: estimated cost of 1 for instruction: %add = fadd contract half -define half @fmul_fadd_contract_f16(half %r0, half %r1, half %r2) #0 { - %mul = fmul contract half %r0, %r1 - %add = fadd contract half %mul, %r2 - ret half %add -} - -; ALL-LABEL: 'fmul_fadd_v2f16': -; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x half> -; SLOW: estimated cost of 1 for instruction: %mul = fmul <2 x half> -; ALL: estimated cost of 1 for instruction: %add = fadd <2 x half> -define <2 x half> @fmul_fadd_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 { - %mul = fmul <2 x half> %r0, %r1 - %add = fadd <2 x half> %mul, %r2 - ret <2 x half> %add -} - -; ALL-LABEL: 'fmul_fsub_f16': -; FUSED: estimated cost of 0 for instruction: %mul = fmul half -; SLOW: estimated cost of 1 for instruction: %mul = fmul half -; ALL: estimated cost of 1 for instruction: %sub = fsub half -define half @fmul_fsub_f16(half %r0, half %r1, half %r2) #0 { - %mul = fmul half %r0, %r1 - %sub = fsub half %mul, %r2 - ret half %sub -} - -; ALL-LABEL: 'fmul_fsub_v2f16': -; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x half> -; SLOW: estimated cost of 1 for instruction: %mul = fmul <2 x half> -; ALL: estimated cost of 1 for instruction: %sub = fsub <2 x half> -define <2 x half> @fmul_fsub_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 { - %mul = fmul <2 x half> %r0, %r1 - %sub = 
fsub <2 x half> %mul, %r2 - ret <2 x half> %sub +; FUSED: estimated cost of 0 for {{.*}} fmul half +; SLOW: estimated cost of 1 for {{.*}} fmul half +; ALL: estimated cost of 1 for {{.*}} fadd half +; ALL: estimated cost of 0 for {{.*}} fmul contract half +; ALL: estimated cost of 1 for {{.*}} fadd contract half +; FUSED: estimated cost of 0 for {{.*}} fmul <2 x half> +; SLOW: estimated cost of 1 for {{.*}} fmul <2 x half> +; ALL: estimated cost of 1 for {{.*}} fadd <2 x half> +; FUSED: estimated cost of 0 for {{.*}} fmul half +; SLOW: estimated cost of 1 for {{.*}} fmul half +; ALL: estimated cost of 1 for {{.*}} fsub half +; FUSED: estimated cost of 0 for {{.*}} fmul <2 x half> +; SLOW: estimated cost of 1 for {{.*}} fmul <2 x half> +; ALL: estimated cost of 1 for {{.*}} fsub <2 x half> +define void @fmul_fadd_f16() #0 { + %f16 = fmul half undef, undef + %f16add = fadd half %f16, undef + %f16c = fmul contract half undef, undef + %f16cadd = fadd contract half %f16c, undef + %v2f16 = fmul <2 x half> undef, undef + %v2f16add = fadd <2 x half> %v2f16, undef + %f16_2 = fmul half undef, undef + %f16sub = fsub half %f16_2, undef + %v2f16_2 = fmul <2 x half> undef, undef + %v2f16sub = fsub <2 x half> %v2f16_2, undef + ret void } ; ALL-LABEL: 'fmul_fadd_f64': -; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double -; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double -; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double -; THRPTALL: estimated cost of 4 for instruction: %add = fadd double -; SIZEALL: estimated cost of 2 for instruction: %add = fadd double -define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 { - %mul = fmul double %r0, %r1 - %add = fadd double %mul, %r2 - ret double %add -} - -; ALL-LABEL: 'fmul_fadd_contract_f64': -; ALL: estimated cost of 0 for instruction: %mul = fmul contract double -; THRPTALL: estimated cost of 4 for instruction: %add = fadd contract double -; SIZEALL: estimated cost of
2 for instruction: %add = fadd contract double -define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 { - %mul = fmul contract double %r0, %r1 - %add = fadd contract double %mul, %r2 - ret double %add -} - -; ALL-LABEL: 'fmul_fadd_v2f64': -; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double> -; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double> -; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double> -; THRPTALL: estimated cost of 8 for instruction: %add = fadd <2 x double> -; SIZEALL: estimated cost of 4 for instruction: %add = fadd <2 x double> -define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 { - %mul = fmul <2 x double> %r0, %r1 - %add = fadd <2 x double> %mul, %r2 - ret <2 x double> %add -} - -; ALL-LABEL: 'fmul_fsub_f64': -; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double -; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double -; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double -; THRPTALL: estimated cost of 4 for instruction: %sub = fsub double -; SIZEALL: estimated cost of 2 for instruction: %sub = fsub double -define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 { - %mul = fmul double %r0, %r1 - %sub = fsub double %mul, %r2 - ret double %sub -} - -; ALL-LABEL: 'fmul_fsub_v2f64': -; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double> -; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double> -; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double> -; THRPTALL: estimated cost of 8 for instruction: %sub = fsub <2 x double> -; SIZEALL: estimated cost of 4 for instruction: %sub = fsub <2 x double> -define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 { - %mul = fmul <2 x double> %r0, %r1 - %sub = fsub <2 x double> %mul, %r2 - ret <2 x double> %sub +; CONTRACT: 
estimated cost of 0 for {{.*}} fmul double +; NOCONTRACT: estimated cost of 4 for {{.*}} fmul double +; SZNOCONTRACT: estimated cost of 2 for {{.*}} fmul double +; THRPTALL: estimated cost of 4 for {{.*}} fadd double +; SIZEALL: estimated cost of 2 for {{.*}} fadd double +; ALL: estimated cost of 0 for {{.*}} fmul contract double +; THRPTALL: estimated cost of 4 for {{.*}} fadd contract double +; SIZEALL: estimated cost of 2 for {{.*}} fadd contract double +; CONTRACT: estimated cost of 0 for {{.*}} fmul <2 x double> +; NOCONTRACT: estimated cost of 8 for {{.*}} fmul <2 x double> +; SZNOCONTRACT: estimated cost of 4 for {{.*}} fmul <2 x double> +; THRPTALL: estimated cost of 8 for {{.*}} fadd <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double> +; CONTRACT: estimated cost of 0 for {{.*}} fmul double +; NOCONTRACT: estimated cost of 4 for {{.*}} fmul double +; SZNOCONTRACT: estimated cost of 2 for {{.*}} fmul double +; THRPTALL: estimated cost of 4 for {{.*}} fsub double +; SIZEALL: estimated cost of 2 for {{.*}} fsub double +; CONTRACT: estimated cost of 0 for {{.*}} fmul <2 x double> +; NOCONTRACT: estimated cost of 8 for {{.*}} fmul <2 x double> +; SZNOCONTRACT: estimated cost of 4 for {{.*}} fmul <2 x double> +; THRPTALL: estimated cost of 8 for {{.*}} fsub <2 x double> +; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double> +define void @fmul_fadd_f64() #0 { + %f64 = fmul double undef, undef + %f64add = fadd double %f64, undef + %f64c = fmul contract double undef, undef + %f64cadd = fadd contract double %f64c, undef + %v2f64 = fmul <2 x double> undef, undef + %v2f64add = fadd <2 x double> %v2f64, undef + %f64_2 = fmul double undef, undef + %f64sub = fsub double %f64_2, undef + %v2f64_2 = fmul <2 x double> undef, undef + %v2f64sub = fsub <2 x double> %v2f64_2, undef + ret void } attributes #0 = { nounwind } + diff --git a/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll b/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll index 
7bd86db270aaa..a7d28413319bf 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll @@ -4,49 +4,20 @@ ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,GFX89 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX89 %s +; END. -; GCN-LABEL: 'insertelement_v2i32' +; GCN-LABEL: 'insertelement_v2' ; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i32> -define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) { - %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr - %insert = insertelement <2 x i32> %vec, i32 123, i32 1 - store <2 x i32> %insert, <2 x i32> addrspace(1)* %out - ret void -} - -; GCN-LABEL: 'insertelement_v2i64' ; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i64> -define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) { - %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr - %insert = insertelement <2 x i64> %vec, i64 123, i64 1 - store <2 x i64> %insert, <2 x i64> addrspace(1)* %out - ret void -} - -; GCN-LABEL: 'insertelement_0_v2i16' ; CI: estimated cost of 1 for {{.*}} insertelement <2 x i16> ; GFX89: estimated cost of 0 for {{.*}} insertelement <2 x i16> -define amdgpu_kernel void @insertelement_0_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %insert = insertelement <2 x i16> %vec, i16 123, i16 0 - store <2 x i16> %insert, <2 x i16> addrspace(1)* %out - ret void -} - -; GCN-LABEL: 'insertelement_1_v2i16' ; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i16> -define amdgpu_kernel void 
@insertelement_1_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %insert = insertelement <2 x i16> %vec, i16 123, i16 1 - store <2 x i16> %insert, <2 x i16> addrspace(1)* %out - ret void -} - -; GCN-LABEL: 'insertelement_1_v2i8' ; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i8> -define amdgpu_kernel void @insertelement_1_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) { - %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr - %insert = insertelement <2 x i8> %vec, i8 123, i8 1 - store <2 x i8> %insert, <2 x i8> addrspace(1)* %out +define amdgpu_kernel void @insertelement_v2() { + %v2i32_1 = insertelement <2 x i32> undef, i32 123, i32 1 + %v2i64_1 = insertelement <2 x i64> undef, i64 123, i64 1 + %v2i16_0 = insertelement <2 x i16> undef, i16 123, i16 0 + %v2i16_1 = insertelement <2 x i16> undef, i16 123, i16 1 + %v2i8_1 = insertelement <2 x i8> undef, i8 123, i8 1 ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/logicalop.ll b/llvm/test/Analysis/CostModel/AMDGPU/logicalop.ll index 21e23d73cdd10..0eedf867502eb 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/logicalop.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/logicalop.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s --check-prefix=CHECK-THROUGHPUT ; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s --check-prefix=CHECK-SIZE +; END. 
define amdgpu_kernel void @op() { ; Logical and/or - select's cost must be equivalent to that of binop diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll index e4ca0685708f7..33109ff18a2c6 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll @@ -2,139 +2,63 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,THRPTALL,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZESLOW16,SIZEALL,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=SIZEFAST16,SIZEALL,ALL %s +; END. ; ALL-LABEL: 'mul_i32' ; THRPTALL: estimated cost of 4 for {{.*}} mul i32 ; SIZEALL: estimated cost of 2 for {{.*}} mul i32 -define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { - %vec = load i32, i32 addrspace(1)* %vaddr - %mul = mul i32 %vec, %b - store i32 %mul, i32 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'mul_v2i32' ; THRPTALL: estimated cost of 8 for {{.*}} mul <2 x i32> ; SIZEALL: estimated cost of 4 for {{.*}} mul <2 x i32> -define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 { - %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr - %mul = mul <2 x i32> %vec, %b - store <2 x i32> %mul, <2 x i32> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'mul_v3i32' ; THRPTALL: estimated cost of 12 for {{.*}} mul <3 x i32> ; SIZEALL: estimated cost of 6 for {{.*}} mul <3 x i32> -define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 { - %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr - %mul = mul <3 x i32> %vec, %b - store <3 x i32> %mul, <3 x i32> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 
'mul_v5i32' -; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32> -; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32> -define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 { - %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr - %mul = mul <5 x i32> %vec, %b - store <5 x i32> %mul, <5 x i32> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'mul_v4i32' ; THRPTALL: estimated cost of 16 for {{.*}} mul <4 x i32> ; SIZEALL: estimated cost of 8 for {{.*}} mul <4 x i32> -define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 { - %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr - %mul = mul <4 x i32> %vec, %b - store <4 x i32> %mul, <4 x i32> addrspace(1)* %out +; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32> +; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32> +define amdgpu_kernel void @mul_i32() #0 { + %i32 = mul i32 undef, undef + %v2i32 = mul <2 x i32> undef, undef + %v3i32 = mul <3 x i32> undef, undef + %v4i32 = mul <4 x i32> undef, undef + %v5i32 = mul <5 x i32> undef, undef ret void } ; ALL-LABEL: 'mul_i64' ; THRPTALL: estimated cost of 20 for {{.*}} mul i64 ; SIZEALL: estimated cost of 12 for {{.*}} mul i64 -define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 { - %vec = load i64, i64 addrspace(1)* %vaddr - %mul = mul i64 %vec, %b - store i64 %mul, i64 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'mul_v2i64' ; THRPTALL: estimated cost of 40 for {{.*}} mul <2 x i64> ; SIZEALL: estimated cost of 24 for {{.*}} mul <2 x i64> -define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 { - %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr - %mul = mul <2 x i64> %vec, %b - store <2 x i64> %mul, <2 x i64> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'mul_v3i64' ; THRPTALL: estimated cost of 60 for {{.*}} 
mul <3 x i64> ; SIZEALL: estimated cost of 36 for {{.*}} mul <3 x i64> -define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 { - %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr - %mul = mul <3 x i64> %vec, %b - store <3 x i64> %mul, <3 x i64> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'mul_v4i64' ; THRPTALL: estimated cost of 80 for {{.*}} mul <4 x i64> ; SIZEALL: estimated cost of 48 for {{.*}} mul <4 x i64> -define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 { - %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr - %mul = mul <4 x i64> %vec, %b - store <4 x i64> %mul, <4 x i64> addrspace(1)* %out - ret void -} - - -; ALL-LABEL: 'mul_v8i64' ; THRPTALL: estimated cost of 320 for {{.*}} mul <8 x i64> ; SIZEALL: estimated cost of 192 for {{.*}} mul <8 x i64> -define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 { - %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr - %mul = mul <8 x i64> %vec, %b - store <8 x i64> %mul, <8 x i64> addrspace(1)* %out +define amdgpu_kernel void @mul_i64() #0 { + %i64 = mul i64 undef, undef + %v2i64 = mul <2 x i64> undef, undef + %v3i64 = mul <3 x i64> undef, undef + %v4i64 = mul <4 x i64> undef, undef + %v8i64 = mul <8 x i64> undef, undef ret void } ; ALL-LABEL: 'mul_i16' ; THRPTALL: estimated cost of 4 for {{.*}} mul i16 ; SIZEALL: estimated cost of 2 for {{.*}} mul i16 -define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 { - %vec = load i16, i16 addrspace(1)* %vaddr - %mul = mul i16 %vec, %b - store i16 %mul, i16 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'mul_v2i16' ; SLOW16: estimated cost of 8 for {{.*}} mul <2 x i16> ; FAST16: estimated cost of 4 for {{.*}} mul <2 x i16> ; SIZESLOW16: estimated cost of 4 for {{.*}} mul <2 x i16> ; SIZEFAST16: estimated cost of 2 for {{.*}} mul 
<2 x i16> -define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %mul = mul <2 x i16> %vec, %b - store <2 x i16> %mul, <2 x i16> addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'mul_v3i16' ; SLOW16: estimated cost of 16 for {{.*}} mul <3 x i16> ; FAST16: estimated cost of 8 for {{.*}} mul <3 x i16> ; SIZESLOW16: estimated cost of 8 for {{.*}} mul <3 x i16> ; SIZEFAST16: estimated cost of 4 for {{.*}} mul <3 x i16> -define amdgpu_kernel void @mul_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %vaddr, <3 x i16> %b) #0 { - %vec = load <3 x i16>, <3 x i16> addrspace(1)* %vaddr - %mul = mul <3 x i16> %vec, %b - store <3 x i16> %mul, <3 x i16> addrspace(1)* %out +define amdgpu_kernel void @mul_i16() #0 { + %i16 = mul i16 undef, undef + %v2i16 = mul <2 x i16> undef, undef + %v3i16 = mul <3 x i16> undef, undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll b/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll index 07592b1f8d4c4..6357408e89148 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -cost-model -cost-kind=throughput -analyze | FileCheck %s +; END. define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll b/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll index c78c115fe6b8c..906ead77c092c 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -cost-model -cost-kind=throughput -analyze | FileCheck %s +; END. 
define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll index 42936644e590b..f67a0fae8e127 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll @@ -2,120 +2,52 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FAST16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOW16 %s +; END. -; ALL-LABEL: 'shl_i32' +; ALL-LABEL: 'shl' ; ALL: estimated cost of 1 for {{.*}} shl i32 -define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { - %vec = load i32, i32 addrspace(1)* %vaddr - %or = shl i32 %vec, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'shl_i64' ; FAST64: estimated cost of 2 for {{.*}} shl i64 ; SLOW64: estimated cost of 4 for {{.*}} shl i64 ; SIZEALL: estimated cost of 2 for {{.*}} shl i64 -define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 { - %vec = load i64, i64 addrspace(1)* %vaddr - %or = shl i64 %vec, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'shl_i16' ; ALL: estimated cost of 1 for {{.*}} shl i16 -define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 { - %vec = load i16, i16 addrspace(1)* %vaddr - %or = shl i16 %vec, %b - store i16 %or, i16 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'shl_v2i16' ; SLOW16: estimated cost of 2 for {{.*}} shl <2 x i16> ; FAST16: estimated cost of 1 for {{.*}} shl <2 x i16> -define amdgpu_kernel void 
@shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %or = shl <2 x i16> %vec, %b - store <2 x i16> %or, <2 x i16> addrspace(1)* %out +define amdgpu_kernel void @shl() #0 { + %i32 = shl i32 undef, undef + %i64 = shl i64 undef, undef + %i16 = shl i16 undef, undef + %v2i16 = shl <2 x i16> undef, undef ret void } -; ALL-LABEL: 'lshr_i32' +; ALL-LABEL: 'lshr' ; ALL: estimated cost of 1 for {{.*}} lshr i32 -define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { - %vec = load i32, i32 addrspace(1)* %vaddr - %or = lshr i32 %vec, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'lshr_i64' ; FAST64: estimated cost of 2 for {{.*}} lshr i64 ; SLOW64: estimated cost of 4 for {{.*}} lshr i64 ; SIZEALL: estimated cost of 2 for {{.*}} lshr i64 -define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 { - %vec = load i64, i64 addrspace(1)* %vaddr - %or = lshr i64 %vec, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'lshr_i16' ; ALL: estimated cost of 1 for {{.*}} lshr i16 -define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 { - %vec = load i16, i16 addrspace(1)* %vaddr - %or = lshr i16 %vec, %b - store i16 %or, i16 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'lshr_v2i16' ; SLOW16: estimated cost of 2 for {{.*}} lshr <2 x i16> ; FAST16: estimated cost of 1 for {{.*}} lshr <2 x i16> -define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %or = lshr <2 x i16> %vec, %b - store <2 x i16> %or, <2 x i16> addrspace(1)* %out +define amdgpu_kernel void @lshr() #0 { + %i32 = lshr i32 undef, undef + %i64 = lshr i64 undef, undef + %i16 = lshr i16 undef, undef + %v2i16 = lshr <2 x i16> undef, undef 
ret void } -; ALL-LABEL: 'ashr_i32' +; ALL-LABEL: 'ashr' ; ALL: estimated cost of 1 for {{.*}} ashr i32 -define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 { - %vec = load i32, i32 addrspace(1)* %vaddr - %or = ashr i32 %vec, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'ashr_i64' ; FAST64: estimated cost of 2 for {{.*}} ashr i64 ; SLOW64: estimated cost of 4 for {{.*}} ashr i64 -define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 { - %vec = load i64, i64 addrspace(1)* %vaddr - %or = ashr i64 %vec, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'ashr_i16' ; ALL: estimated cost of 1 for {{.*}} ashr i16 -define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 { - %vec = load i16, i16 addrspace(1)* %vaddr - %or = ashr i16 %vec, %b - store i16 %or, i16 addrspace(1)* %out - ret void -} - -; ALL-LABEL: 'ashr_v2i16' ; SLOW16: estimated cost of 2 for {{.*}} ashr <2 x i16> ; FAST16: estimated cost of 1 for {{.*}} ashr <2 x i16> -define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr - %or = ashr <2 x i16> %vec, %b - store <2 x i16> %or, <2 x i16> addrspace(1)* %out +define amdgpu_kernel void @ashr() #0 { + %i32 = ashr i32 undef, undef + %i64 = ashr i64 undef, undef + %i16 = ashr i16 undef, undef + %v2i16 = ashr <2 x i16> undef, undef ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll index 7fe1cebd879f0..cdbd9ec570abe 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll @@ -5,6 +5,7 @@ ; RUN: opt < %s -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -cost-kind=code-size -S | FileCheck -check-prefixes=GFX9-CS 
%s ; RUN: opt < %s -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -S | FileCheck -check-prefixes=VI %s ; RUN: opt < %s -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -cost-kind=code-size -S | FileCheck -check-prefixes=VI-CS %s +; END. define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> %vec0, <2 x i16> %vec1) { ; GFX10-LABEL: 'shufflevector_00_v2i16'