diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 609769fd51488..1328025f1c3c2 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -2,154 +2,63 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOW16,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST16,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOW16,ALL %s
+; END.
 
-
-; ALL: 'add_i32'
+; ALL-LABEL: 'add_i32'
 ; ALL: estimated cost of 1 for {{.*}} add i32
-define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %add = add i32 %vec, %b
-  store i32 %add, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'add_v2i32'
 ; ALL: estimated cost of 2 for {{.*}} add <2 x i32>
-define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
-  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
-  %add = add <2 x i32> %vec, %b
-  store <2 x i32> %add, <2 x i32> addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'add_v3i32'
-; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening,
-; and 3 when it is legal.
+;;; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening,
+;;; and 3 when it is legal.
 ; ALL: estimated cost of {{[34]}} for {{.*}} add <3 x i32>
-define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
-  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
-  %add = add <3 x i32> %vec, %b
-  store <3 x i32> %add, <3 x i32> addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'add_v4i32'
 ; ALL: estimated cost of 4 for {{.*}} add <4 x i32>
-define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
-  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
-  %add = add <4 x i32> %vec, %b
-  store <4 x i32> %add, <4 x i32> addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'add_v5i32'
-; Allow for 8 when v3i32 is illegal and TargetLowering thinks it needs widening,
-; and 5 when it is legal.
+;;; Allow for 8 when v5i32 is illegal and TargetLowering thinks it needs widening,
+;;; and 5 when it is legal.
 ; ALL: estimated cost of {{[58]}} for {{.*}} add <5 x i32>
-define amdgpu_kernel void @add_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
-  %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
-  %add = add <5 x i32> %vec, %b
-  store <5 x i32> %add, <5 x i32> addrspace(1)* %out
+define amdgpu_kernel void @add_i32() #0 {
+  %i32 = add i32 undef, undef
+  %v2i32 = add <2 x i32> undef, undef
+  %v3i32 = add <3 x i32> undef, undef
+  %v4i32 = add <4 x i32> undef, undef
+  %v5i32 = add <5 x i32> undef, undef
   ret void
 }
 
-; ALL: 'add_i64'
+; ALL-LABEL: 'add_i64'
 ; ALL: estimated cost of 2 for {{.*}} add i64
-define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
-  %vec = load i64, i64 addrspace(1)* %vaddr
-  %add = add i64 %vec, %b
-  store i64 %add, i64 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'add_v2i64'
 ; ALL: estimated cost of 4 for {{.*}} add <2 x i64>
-define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
-  %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
-  %add = add <2 x i64> %vec, %b
-  store <2 x i64> %add, <2 x i64> addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'add_v3i64'
 ; ALL: estimated cost of 6 for {{.*}} add <3 x i64>
-define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
-  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
-  %add = add <3 x i64> %vec, %b
-  store <3 x i64> %add, <3 x i64> addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'add_v4i64'
 ; ALL: estimated cost of 8 for {{.*}} add <4 x i64>
-define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
-  %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
-  %add = add <4 x i64> %vec, %b
-  store <4 x i64> %add, <4 x i64> addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'add_v16i64'
 ; ALL: estimated cost of 128 for {{.*}} add <16 x i64>
-define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
-  %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
-  %add = add <16 x i64> %vec, %b
-  store <16 x i64> %add, <16 x i64> addrspace(1)* %out
+define amdgpu_kernel void @add_i64() #0 {
+  %i64 = add i64 undef, undef
+  %v2i64 = add <2 x i64> undef, undef
+  %v3i64 = add <3 x i64> undef, undef
+  %v4i64 = add <4 x i64> undef, undef
+  %v16i64 = add <16 x i64> undef, undef
   ret void
 }
 
-; ALL: 'add_i16'
+; ALL-LABEL: 'add_i16'
 ; ALL: estimated cost of 1 for {{.*}} add i16
-define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
-  %vec = load i16, i16 addrspace(1)* %vaddr
-  %add = add i16 %vec, %b
-  store i16 %add, i16 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'add_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} add <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} add <2 x i16>
-define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %add = add <2 x i16> %vec, %b
-  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+define amdgpu_kernel void @add_i16() #0 {
+  %i16 = add i16 undef, undef
+  %v2i16 = add <2 x i16> undef, undef
   ret void
 }
 
-; ALL: 'sub_i32'
+; ALL-LABEL: 'sub'
 ; ALL: estimated cost of 1 for {{.*}} sub i32
-define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %sub = sub i32 %vec, %b
-  store i32 %sub, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'sub_i64'
 ; ALL: estimated cost of 2 for {{.*}} sub i64
-define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
-  %vec = load i64, i64 addrspace(1)* %vaddr
-  %sub = sub i64 %vec, %b
-  store i64 %sub, i64 addrspace(1)* %out
-  ret void
-}
-; ALL: 'sub_i16'
 ; ALL: estimated cost of 1 for {{.*}} sub i16
-define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
-  %vec = load i16, i16 addrspace(1)* %vaddr
-  %sub = sub i16 %vec, %b
-  store i16 %sub, i16 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'sub_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} sub <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} sub <2 x i16>
-define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %sub = sub <2 x i16> %vec, %b
-  store <2 x i16> %sub, <2 x i16> addrspace(1)* %out
+define amdgpu_kernel void @sub() #0 {
+  %i32 = sub i32 undef, undef
+  %i64 = sub i64 undef, undef
+  %i16 = sub i16 undef, undef
+  %v2i16 = sub <2 x i16> undef, undef
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll b/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll
index a87a965c6bfd0..8ca13eed2f43f 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/addrspacecast.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
+; END.
 
 ; CHECK-LABEL: 'addrspacecast_global_to_flat'
 ; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8*
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
index 2dec5f350936d..63f7ab74e2006 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
@@ -2,88 +2,41 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FAST16 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL,SLOW16 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FAST16 %s
+; END.
 
-; ALL: 'or_i32'
+; ALL-LABEL: 'or'
 ; ALL: estimated cost of 1 for {{.*}} or i32
-define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %or = or i32 %vec, %b
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'or_i64'
 ; ALL: estimated cost of 2 for {{.*}} or i64
-define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
-  %vec = load i64, i64 addrspace(1)* %vaddr
-  %or = or i64 %vec, %b
-  store i64 %or, i64 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'or_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} or <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} or <2 x i16>
-define amdgpu_kernel void @or_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %or = or <2 x i16> %vec, %b
-  store <2 x i16> %or, <2 x i16> addrspace(1)* %out
+define amdgpu_kernel void @or() #0 {
+  %i32 = or i32 undef, undef
+  %i64 = or i64 undef, undef
+  %v2i16 = or <2 x i16> undef, undef
   ret void
 }
 
-; ALL: 'xor_i32'
+; ALL-LABEL: 'xor'
 ; ALL: estimated cost of 1 for {{.*}} xor i32
-define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %or = xor i32 %vec, %b
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'xor_i64'
 ; ALL: estimated cost of 2 for {{.*}} xor i64
-define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
-  %vec = load i64, i64 addrspace(1)* %vaddr
-  %or = xor i64 %vec, %b
-  store i64 %or, i64 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'xor_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} xor <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} xor <2 x i16>
-define amdgpu_kernel void @xor_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %xor = xor <2 x i16> %vec, %b
-  store <2 x i16> %xor, <2 x i16> addrspace(1)* %out
+define amdgpu_kernel void @xor() #0 {
+  %i32 = xor i32 undef, undef
+  %i64 = xor i64 undef, undef
+  %v2i16 = xor <2 x i16> undef, undef
   ret void
 }
 
-; ALL: 'and_i32'
+; ALL-LABEL: 'and'
 ; ALL: estimated cost of 1 for {{.*}} and i32
-define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %or = and i32 %vec, %b
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'and_i64'
 ; ALL: estimated cost of 2 for {{.*}} and i64
-define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
-  %vec = load i64, i64 addrspace(1)* %vaddr
-  %or = and i64 %vec, %b
-  store i64 %or, i64 addrspace(1)* %out
-  ret void
-}
-
-; ALL: 'and_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} and <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} and <2 x i16>
-define amdgpu_kernel void @and_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %and = and <2 x i16> %vec, %b
-  store <2 x i16> %and, <2 x i16> addrspace(1)* %out
+define amdgpu_kernel void @and() #0 {
+  %i32 = and i32 undef, undef
+  %i64 = and i64 undef, undef
+  %v2i16 = and <2 x i16> undef, undef
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll b/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll
index 88c4956f3fc0e..dbe55ff7bfc86 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/control-flow.ll
@@ -1,15 +1,16 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck  --check-prefixes=ALL,SPEED %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck --check-prefixes=ALL,SIZE %s
+; END.
 
 ; ALL-LABEL: 'test_br_cost'
-; SPEED: estimated cost of 7 for instruction: br i1
+; SPEED-NEXT: estimated cost of 7 for instruction: br i1
 ; SPEED: estimated cost of 4 for instruction: br label
-; SPEED: estimated cost of 1 for instruction: %phi = phi i32 [
-; SPEED: estimated cost of 10 for instruction: ret void
-; SIZE: estimated cost of 5 for instruction: br i1
+; SPEED-NEXT: estimated cost of 1 for instruction: %phi = phi i32 [
+; SPEED-NEXT: estimated cost of 10 for instruction: ret void
+; SIZE-NEXT: estimated cost of 5 for instruction: br i1
 ; SIZE: estimated cost of 1 for instruction: br label
-; SIZE: estimated cost of 0 for instruction: %phi = phi i32 [
-; SIZE: estimated cost of 1 for instruction: ret void
+; SIZE-NEXT: estimated cost of 0 for instruction: %phi = phi i32 [
+; SIZE-NEXT: estimated cost of 1 for instruction: ret void
 define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 bb0:
   br i1 undef, label %bb1, label %bb2
@@ -26,8 +27,8 @@ bb2:
 }
 
 ; ALL-LABEL: 'test_switch_cost'
-; SPEED: estimated cost of 24 for instruction:   switch
-; SIZE: estimated cost of 18 for instruction:   switch
+; SPEED-NEXT: estimated cost of 24 for instruction:   switch
+; SIZE-NEXT: estimated cost of 18 for instruction:   switch
 define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
 entry:
   switch i32 %a, label %default [
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll b/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll
index 67ce8ffba936e..1af8d862732c3 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -4,141 +4,55 @@
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,GFX89 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX89 %s
-
-
-; GCN: 'extractelement_v2i32'
-; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i32>
-define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
-  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
-  %elt = extractelement <2 x i32> %vec, i32 1
-  store i32 %elt, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v2f32'
-; GCN: estimated cost of 0 for {{.*}} extractelement <2 x float>
-define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %elt = extractelement <2 x float> %vec, i32 1
-  store float %elt, float addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v3i32'
-; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i32>
-define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
-  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
-  %elt = extractelement <3 x i32> %vec, i32 1
-  store i32 %elt, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v4i32'
-; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i32>
-define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
-  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
-  %elt = extractelement <4 x i32> %vec, i32 1
-  store i32 %elt, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v5i32'
-; GCN: estimated cost of 0 for {{.*}} extractelement <5 x i32>
-define amdgpu_kernel void @extractelement_v5i32(i32 addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr) {
-  %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
-  %elt = extractelement <5 x i32> %vec, i32 1
-  store i32 %elt, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v8i32'
-; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32>
-define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
-  %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
-  %elt = extractelement <8 x i32> %vec, i32 1
-  store i32 %elt, i32 addrspace(1)* %out
-  ret void
-}
-
-; FIXME: Should be non-0
-; GCN: 'extractelement_v8i32_dynindex'
-; GCN: estimated cost of 2 for {{.*}} extractelement <8 x i32>
-define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
-  %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
-  %elt = extractelement <8 x i32> %vec, i32 %idx
-  store i32 %elt, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v2i64'
-; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i64>
-define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
-  %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
-  %elt = extractelement <2 x i64> %vec, i64 1
-  store i64 %elt, i64 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v3i64'
-; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i64>
-define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
-  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
-  %elt = extractelement <3 x i64> %vec, i64 1
-  store i64 %elt, i64 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v4i64'
-; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i64>
-define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
-  %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
-  %elt = extractelement <4 x i64> %vec, i64 1
-  store i64 %elt, i64 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v8i64'
-; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i64>
-define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
-  %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
-  %elt = extractelement <8 x i64> %vec, i64 1
-  store i64 %elt, i64 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_v4i8'
-; GCN: estimated cost of 1 for {{.*}} extractelement <4 x i8>
-define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
-  %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
-  %elt = extractelement <4 x i8> %vec, i8 1
-  store i8 %elt, i8 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_0_v2i16':
-; CI: estimated cost of 1 for {{.*}} extractelement <2 x i16> %vec, i16 0
-; GFX89: estimated cost of 0 for {{.*}} extractelement <2 x i16>
-define amdgpu_kernel void @extractelement_0_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %elt = extractelement <2 x i16> %vec, i16 0
-  store i16 %elt, i16 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_1_v2i16':
-; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
-define amdgpu_kernel void @extractelement_1_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %elt = extractelement <2 x i16> %vec, i16 1
-  store i16 %elt, i16 addrspace(1)* %out
-  ret void
-}
-
-; GCN: 'extractelement_var_v2i16'
-; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
-define amdgpu_kernel void @extractelement_var_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, i32 %idx) {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %elt = extractelement <2 x i16> %vec, i32 %idx
-  store i16 %elt, i16 addrspace(1)* %out
+; END.
+
+; GCN-LABEL: 'extractelement_32'
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x i32>
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x float>
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <3 x i32>
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <4 x i32>
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <5 x i32>
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <8 x i32>
+; GCN-NEXT: estimated cost of 2 for {{.*}} extractelement <8 x i32>
+define amdgpu_kernel void @extractelement_32(i32 %arg) {
+  %v2i32_1 = extractelement <2 x i32> undef, i32 1
+  %v2f32_1 = extractelement <2 x float> undef, i32 1
+  %v3i32_1 = extractelement <3 x i32> undef, i32 1
+  %v4i32_1 = extractelement <4 x i32> undef, i32 1
+  %v5i32_1 = extractelement <5 x i32> undef, i32 1
+  %v8i32_1 = extractelement <8 x i32> undef, i32 1
+  %v8i32_a = extractelement <8 x i32> undef, i32 %arg
+  ret void
+}
+
+; GCN-LABEL: 'extractelement_64'
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x i64>
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <3 x i64>
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <4 x i64>
+; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <8 x i64>
+define amdgpu_kernel void @extractelement_64() {
+  %v2i64_1 = extractelement <2 x i64> undef, i64 1
+  %v3i64_1 = extractelement <3 x i64> undef, i64 1
+  %v4i64_1 = extractelement <4 x i64> undef, i64 1
+  %v8i64_1 = extractelement <8 x i64> undef, i64 1
+  ret void
+}
+
+; GCN-LABEL: 'extractelement_8'
+; GCN-NEXT: estimated cost of 1 for {{.*}} extractelement <4 x i8>
+define amdgpu_kernel void @extractelement_8() {
+  %v4i8_1 = extractelement <4 x i8> undef, i8 1
+  ret void
+}
+
+; GCN-LABEL: 'extractelement_16'
+; CI-NEXT: estimated cost of 1 for {{.*}} extractelement <2 x i16> undef, i16 0
+; GFX89-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x i16>
+; GCN-NEXT: estimated cost of 1 for {{.*}} extractelement <2 x i16>
+; GCN-NEXT: estimated cost of 1 for {{.*}} extractelement <2 x i16>
+define amdgpu_kernel void @extractelement_16(i32 %arg) {
+  %v2i16_0 = extractelement <2 x i16> undef, i16 0
+  %v2i16_1 = extractelement <2 x i16> undef, i16 1
+  %v2i16_a = extractelement <2 x i16> undef, i32 %arg
   ret void
 }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
index de5381c2102ae..a616d455ce80c 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -1,93 +1,39 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+; END.
 
 ; CHECK-LABEL: 'fabs_f32'
 ; CHECK: estimated cost of 0 for {{.*}} call float @llvm.fabs.f32
-define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
-  %vec = load float, float addrspace(1)* %vaddr
-  %fabs = call float @llvm.fabs.f32(float %vec) #1
-  store float %fabs, float addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fabs_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
-define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
-  store <2 x float> %fabs, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fabs_v3f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
-define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
-  %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
-  store <3 x float> %fabs, <3 x float> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fabs_v5f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <5 x float> @llvm.fabs.v5f32
-define amdgpu_kernel void @fabs_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
-  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
-  %fabs = call <5 x float> @llvm.fabs.v5f32(<5 x float> %vec) #1
-  store <5 x float> %fabs, <5 x float> addrspace(1)* %out
+define amdgpu_kernel void @fabs_f32() #0 {
+  %f32 = call float @llvm.fabs.f32(float undef) #1
+  %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #1
+  %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #1
+  %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #1
   ret void
 }
 
 ; CHECK-LABEL: 'fabs_f64'
 ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
-define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
-  %vec = load double, double addrspace(1)* %vaddr
-  %fabs = call double @llvm.fabs.f64(double %vec) #1
-  store double %fabs, double addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fabs_v2f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
-define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
-  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
-  %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
-  store <2 x double> %fabs, <2 x double> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fabs_v3f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
-define amdgpu_kernel void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
-  %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1
-  store <3 x double> %fabs, <3 x double> addrspace(1)* %out
+define amdgpu_kernel void @fabs_f64() #0 {
+  %f64 = call double @llvm.fabs.f64(double undef) #1
+  %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #1
+  %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #1
   ret void
 }
 
 ; CHECK-LABEL: 'fabs_f16'
 ; CHECK: estimated cost of 0 for {{.*}} call half @llvm.fabs.f16
-define amdgpu_kernel void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
-  %vec = load half, half addrspace(1)* %vaddr
-  %fabs = call half @llvm.fabs.f16(half %vec) #1
-  store half %fabs, half addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fabs_v2f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x half> @llvm.fabs.v2f16
-define amdgpu_kernel void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vec) #1
-  store <2 x half> %fabs, <2 x half> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fabs_v3f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x half> @llvm.fabs.v3f16
-define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
-  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
-  %fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %vec) #1
-  store <3 x half> %fabs, <3 x half> addrspace(1)* %out
+define amdgpu_kernel void @fabs_f16() #0 {
+  %f16 = call half @llvm.fabs.f16(half undef) #1
+  %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #1
+  %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #1
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index 8eb1a07691707..b79a09c2c31f3 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -3,47 +3,25 @@
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s
+; END.
 
 ; ALL-LABEL: 'fadd_f32'
 ; ALL: estimated cost of 1 for {{.*}} fadd float
-define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
-  %vec = load float, float addrspace(1)* %vaddr
-  %add = fadd float %vec, %b
-  store float %add, float addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fadd_v2f32'
 ; NOPACKEDF32: estimated cost of 2 for {{.*}} fadd <2 x float>
 ; PACKEDF32: estimated cost of 1 for {{.*}} fadd <2 x float>
-define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %add = fadd <2 x float> %vec, %b
-  store <2 x float> %add, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fadd_v3f32'
 ; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
 ; and 3 when it is legal.
 ; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
 ; PACKEDF32: estimated cost of 2 for {{.*}} fadd <3 x float>
-define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
-  %add = fadd <3 x float> %vec, %b
-  store <3 x float> %add, <3 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fadd_v5f32'
 ; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
 ; and 5 when it is legal.
 ; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
 ; PACKEDF32: estimated cost of 3 for {{.*}} fadd <5 x float>
-define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
-  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
-  %add = fadd <5 x float> %vec, %b
-  store <5 x float> %add, <5 x float> addrspace(1)* %out
+define amdgpu_kernel void @fadd_f32() #0 {
+  %f32 = fadd float undef, undef
+  %v2f32 = fadd <2 x float> undef, undef
+  %v3f32 = fadd <3 x float> undef, undef
+  %v5f32 = fadd <5 x float> undef, undef
   ret void
 }
 
@@ -52,73 +30,34 @@ define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
 ; SLOWF64: estimated cost of 4 for {{.*}} fadd double
 ; SIZEALL: estimated cost of 2 for {{.*}} fadd double
-define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
-  %vec = load double, double addrspace(1)* %vaddr
-  %add = fadd double %vec, %b
-  store double %add, double addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fadd_v2f64'
 ; GFX90A-FASTF64: estimated cost of 2 for {{.*}} fadd <2 x double>
 ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
 ; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double>
 ; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double>
-define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
-  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
-  %add = fadd <2 x double> %vec, %b
-  store <2 x double> %add, <2 x double> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fadd_v3f64'
 ; GFX90A-FASTF64: estimated cost of 3 for {{.*}} fadd <3 x double>
 ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
 ; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double>
 ; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double>
-define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
-  %add = fadd <3 x double> %vec, %b
-  store <3 x double> %add, <3 x double> addrspace(1)* %out
+define amdgpu_kernel void @fadd_f64() #0 { ; one kernel holding the scalar, 2-wide and 3-wide f64 fadd cases
+  %f64 = fadd double undef, undef ; undef operands, result unused
+  %v2f64 = fadd <2 x double> undef, undef ; keep the instruction order in step with the check lines above
+  %v3f64 = fadd <3 x double> undef, undef
   ret void
 }
 
 ; ALL-LABEL: 'fadd_f16'
 ; ALL: estimated cost of 1 for {{.*}} fadd half
-define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
-  %vec = load half, half addrspace(1)* %vaddr
-  %add = fadd half %vec, %b
-  store half %add, half addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fadd_v2f16'
 ; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half>
 ; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half>
-define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %add = fadd <2 x half> %vec, %b
-  store <2 x half> %add, <2 x half> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fadd_v3f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half>
-define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
-  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
-  %add = fadd <3 x half> %vec, %b
-  store <3 x half> %add, <3 x half> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fadd_v4f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fadd <4 x half>
-define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
-  %add = fadd <4 x half> %vec, %b
-  store <4 x half> %add, <4 x half> addrspace(1)* %out
+define amdgpu_kernel void @fadd_f16() #0 { ; one kernel holding the scalar, 2-, 3- and 4-wide f16 fadd cases
+  %f16 = fadd half undef, undef ; undef operands, result unused
+  %v2f16 = fadd <2 x half> undef, undef
+  %v3f16 = fadd <3 x half> undef, undef ; the checks above expect the same cost for 3-wide and 4-wide
+  %v4f16 = fadd <4 x half> undef, undef
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
index 883db92932a8f..d4836a9d69049 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
@@ -9,84 +9,39 @@
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16  %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16  %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZEF16 %s
+; END.
 
 ; ALL-LABEL: 'fdiv_f32_ieee'
 ; THRPTALL: estimated cost of 14 for {{.*}} fdiv float
+; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
+; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
+; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
 ; SIZEALL: estimated cost of 12 for {{.*}} fdiv float
-define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
-  %vec = load float, float addrspace(1)* %vaddr
-  %add = fdiv float %vec, %b
-  store float %add, float addrspace(1)* %out
+; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
+; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float>
+; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float>
+define amdgpu_kernel void @fdiv_f32_ieee() #0 { ; f32 fdiv at widths 1, 2, 3 and 5 under attribute group #0
+  %f32 = fdiv float undef, undef ; undef operands, result unused
+  %v2f32 = fdiv <2 x float> undef, undef ; each expected vector cost above is the scalar cost times the lane count
+  %v3f32 = fdiv <3 x float> undef, undef
+  %v5f32 = fdiv <5 x float> undef, undef
   ret void
 }
 
 ; ALL-LABEL: 'fdiv_f32_ftzdaz'
 ; THRPTALL: estimated cost of 16 for {{.*}} fdiv float
 ; SIZEALL: estimated cost of 14 for {{.*}} fdiv float
-define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #1 {
-  %vec = load float, float addrspace(1)* %vaddr
-  %add = fdiv float %vec, %b
-  store float %add, float addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v2f32_ieee'
-; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
-; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
-define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %add = fdiv <2 x float> %vec, %b
-  store <2 x float> %add, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v2f32_ftzdaz'
 ; THRPTALL: estimated cost of 32 for {{.*}} fdiv <2 x float>
 ; SIZEALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
-define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #1 {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %add = fdiv <2 x float> %vec, %b
-  store <2 x float> %add, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v3f32_ieee'
-; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
-; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float>
-define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
-  %add = fdiv <3 x float> %vec, %b
-  store <3 x float> %add, <3 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v3f32_ftzdaz'
 ; THRPTALL: estimated cost of 48 for {{.*}} fdiv <3 x float>
 ; SIZEALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
-define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #1 {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
-  %add = fdiv <3 x float> %vec, %b
-  store <3 x float> %add, <3 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v5f32_ieee'
-; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
-; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float>
-define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
-  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
-  %add = fdiv <5 x float> %vec, %b
-  store <5 x float> %add, <5 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v5f32_ftzdaz'
 ; THRPTALL: estimated cost of 80 for {{.*}} fdiv <5 x float>
 ; SIZEALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
-define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #1 {
-  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
-  %add = fdiv <5 x float> %vec, %b
-  store <5 x float> %add, <5 x float> addrspace(1)* %out
+define amdgpu_kernel void @fdiv_f32_ftzdaz() #1 { ; same f32 fdiv widths as fdiv_f32_ieee but under attribute group #1 (defined outside this hunk, presumably the flush-denormal mode the name suggests, TODO confirm)
+  %f32 = fdiv float undef, undef ; undef operands, result unused
+  %v2f32 = fdiv <2 x float> undef, undef ; each expected vector cost above is the scalar cost times the lane count
+  %v3f32 = fdiv <3 x float> undef, undef
+  %v5f32 = fdiv <5 x float> undef, undef
   ret void
 }
 
@@ -97,208 +52,107 @@ define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5
 ; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double
 ; SIZECI: estimated cost of 22 for {{.*}} fdiv double
 ; SIZESI: estimated cost of 25 for {{.*}} fdiv double
-define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
-  %vec = load double, double addrspace(1)* %vaddr
-  %add = fdiv double %vec, %b
-  store double %add, double addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v2f64'
 ; CIFASTF64: estimated cost of 48 for {{.*}} fdiv <2 x double>
 ; CISLOWF64: estimated cost of 76 for {{.*}} fdiv <2 x double>
 ; SIFASTF64: estimated cost of 54 for {{.*}} fdiv <2 x double>
 ; SISLOWF64: estimated cost of 82 for {{.*}} fdiv <2 x double>
 ; SIZECI: estimated cost of 44 for {{.*}} fdiv <2 x double>
 ; SIZESI: estimated cost of 50 for {{.*}} fdiv <2 x double>
-define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
-  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
-  %add = fdiv <2 x double> %vec, %b
-  store <2 x double> %add, <2 x double> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v3f64'
 ; CIFASTF64: estimated cost of 72 for {{.*}} fdiv <3 x double>
 ; CISLOWF64: estimated cost of 114 for {{.*}} fdiv <3 x double>
 ; SIFASTF64: estimated cost of 81 for {{.*}} fdiv <3 x double>
 ; SISLOWF64: estimated cost of 123 for {{.*}} fdiv <3 x double>
 ; SIZECI: estimated cost of 66 for {{.*}} fdiv <3 x double>
 ; SIZESI: estimated cost of 75 for {{.*}} fdiv <3 x double>
-define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
-  %add = fdiv <3 x double> %vec, %b
-  store <3 x double> %add, <3 x double> addrspace(1)* %out
+define amdgpu_kernel void @fdiv_f64() #0 { ; f64 fdiv at widths 1, 2 and 3 in a single kernel
+  %f64 = fdiv double undef, undef ; undef operands, result unused
+  %v2f64 = fdiv <2 x double> undef, undef ; keep the instruction order in step with the check lines above
+  %v3f64 = fdiv <3 x double> undef, undef
   ret void
 }
 
-; ALL-LABEL: 'fdiv_f16_f32_ieee'
+; ALL-LABEL: 'fdiv_f16_f32ieee'
 ; NOFP16: estimated cost of 14 for {{.*}} fdiv half
 ; FP16: estimated cost of 12 for {{.*}} fdiv half
 ; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half
 ; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
-define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
-  %vec = load half, half addrspace(1)* %vaddr
-  %add = fdiv half %vec, %b
-  store half %add, half addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_f16_f32_ftzdaz'
-; NOFP16: estimated cost of 16 for {{.*}} fdiv half
-; FP16: estimated cost of 12 for {{.*}} fdiv half
-; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half
-; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
-define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #1 {
-  %vec = load half, half addrspace(1)* %vaddr
-  %add = fdiv half %vec, %b
-  store half %add, half addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v2f16_f32_ieee'
 ; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half>
 ; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
 ; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half>
 ; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
-define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %add = fdiv <2 x half> %vec, %b
-  store <2 x half> %add, <2 x half> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v2f16_f32_ftzdaz'
-; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half>
-; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
-; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half>
-; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
-define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #1 {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %add = fdiv <2 x half> %vec, %b
-  store <2 x half> %add, <2 x half> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fdiv_v4f16_f32_ieee'
 ; NOFP16: estimated cost of 56 for {{.*}} fdiv <4 x half>
 ; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
 ; SIZENOF16: estimated cost of 48 for {{.*}} fdiv <4 x half>
 ; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half>
-define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
-  %add = fdiv <4 x half> %vec, %b
-  store <4 x half> %add, <4 x half> addrspace(1)* %out
+define amdgpu_kernel void @fdiv_f16_f32ieee() #0 { ; f16 fdiv at widths 1, 2 and 4 under attribute group #0
+  %f16 = fdiv half undef, undef ; undef operands, result unused
+  %v2f16 = fdiv <2 x half> undef, undef
+  %v4f16 = fdiv <4 x half> undef, undef ; there is no 3-wide f16 case in this kernel
   ret void
 }
 
-; ALL-LABEL: 'fdiv_v4f16_f32_ftzdaz'
+; ALL-LABEL: 'fdiv_f16_f32ftzdaz'
+; NOFP16: estimated cost of 16 for {{.*}} fdiv half
+; FP16: estimated cost of 12 for {{.*}} fdiv half
+; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half
+; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
+; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half>
+; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
+; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half>
+; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
 ; NOFP16: estimated cost of 64 for {{.*}} fdiv <4 x half>
 ; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
 ; SIZENOF16: estimated cost of 56 for {{.*}} fdiv <4 x half>
 ; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half>
-define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #1 {
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
-  %add = fdiv <4 x half> %vec, %b
-  store <4 x half> %add, <4 x half> addrspace(1)* %out
+define amdgpu_kernel void @fdiv_f16_f32ftzdaz() #1 { ; same f16 fdiv widths as fdiv_f16_f32ieee but under attribute group #1 (defined outside this hunk, presumably the flush-denormal f32 mode, TODO confirm)
+  %f16 = fdiv half undef, undef ; undef operands, result unused
+  %v2f16 = fdiv <2 x half> undef, undef
+  %v4f16 = fdiv <4 x half> undef, undef
   ret void
 }
 
-; ALL-LABEL: 'rcp_f32_ieee'
+; ALL-LABEL: 'rcp_ieee'
 ; THRPTALL: estimated cost of 14 for {{.*}} fdiv float
 ; SIZEALL: estimated cost of 12 for {{.*}} fdiv float
-define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
-  %vec = load float, float addrspace(1)* %vaddr
-  %add = fdiv float 1.0, %vec
-  store float %add, float addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'rcp_f32_ftzdaz'
-; THRPTALL: estimated cost of 4 for {{.*}} fdiv float
-; SIZEALL: estimated cost of 2 for {{.*}} fdiv float
-define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr) #1 {
-  %vec = load float, float addrspace(1)* %vaddr
-  %add = fdiv float 1.0, %vec
-  store float %add, float addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'rcp_f16_f32_ieee'
 ; NOFP16: estimated cost of 14 for {{.*}} fdiv half
 ; FP16: estimated cost of 4 for {{.*}} fdiv half
 ; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half
 ; SIZEF16: estimated cost of 2 for {{.*}} fdiv half
-define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
-  %vec = load half, half addrspace(1)* %vaddr
-  %add = fdiv half 1.0, %vec
-  store half %add, half addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'rcp_f16_f32_ftzdaz'
-; THRPTALL: estimated cost of 4 for {{.*}} fdiv half
-; SIZEALL: estimated cost of 2 for {{.*}} fdiv half
-define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr) #1 {
-  %vec = load half, half addrspace(1)* %vaddr
-  %add = fdiv half 1.0, %vec
-  store half %add, half addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'rcp_f64'
 ; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double
 ; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double
 ; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double
 ; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double
 ; SIZECI: estimated cost of 22 for {{.*}} fdiv double
 ; SIZESI: estimated cost of 25 for {{.*}} fdiv double
-define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
-  %vec = load double, double addrspace(1)* %vaddr
-  %add = fdiv double 1.0, %vec
-  store double %add, double addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'rcp_v2f32_ieee'
 ; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
 ; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
-define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
-  store <2 x float> %add, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'rcp_v2f32_ftzdaz'
-; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float>
-; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float>
-define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #1 {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
-  store <2 x float> %add, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'rcp_v2f16_f32_ieee'
 ; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half>
 ; FP16: estimated cost of 8 for {{.*}} fdiv <2 x half>
 ; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half>
 ; SIZEF16: estimated cost of 4 for {{.*}} fdiv <2 x half>
-define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
-  store <2 x half> %add, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @rcp_ieee() #0 { ; reciprocal-shaped fdiv (constant 1.0 numerator) for several types under attribute group #0
+  %f32 = fdiv float 1.0, undef ; only the denominator is undef, the numerator is the literal 1.0
+  %f16 = fdiv half 1.0, undef
+  %f64 = fdiv double 1.0, undef
+  %v2f32 = fdiv <2 x float> <float 1.0, float 1.0>, undef ; splat of 1.0 as the vector numerator
+  %v2f16 = fdiv <2 x half> <half 1.0, half 1.0>, undef
   ret void
 }
 
-; ALL-LABEL: 'rcp_v2f16_f32_ftzdaz'
+; ALL-LABEL: 'rcp_ftzdaz'
+; THRPTALL: estimated cost of 4 for {{.*}} fdiv float
+; SIZEALL: estimated cost of 2 for {{.*}} fdiv float
+; THRPTALL: estimated cost of 4 for {{.*}} fdiv half
+; SIZEALL: estimated cost of 2 for {{.*}} fdiv half
+; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float>
+; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float>
 ; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x half>
 ; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x half>
-define amdgpu_kernel void @rcp_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #1 {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
-  store <2 x half> %add, <2 x half> addrspace(1)* %out
+define amdgpu_kernel void @rcp_ftzdaz() #1 { ; reciprocal-shaped fdiv under attribute group #1 (defined outside this hunk, presumably the flush-denormal mode, TODO confirm)
+  %f32 = fdiv float 1.0, undef ; only the denominator is undef, the numerator is the literal 1.0
+  %f16 = fdiv half 1.0, undef
+  %v2f32 = fdiv <2 x float> <float 1.0, float 1.0>, undef ; splat of 1.0 as the vector numerator
+  %v2f16 = fdiv <2 x half> <half 1.0, half 1.0>, undef ; this kernel has no f64 instruction
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index c90ca1412effa..1758663ffeff5 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -3,48 +3,26 @@
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s
+; END.
 
 ; ALL-LABEL: 'fma_f32'
 ; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32
 ; FASTF32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
 ; SIZEALL: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
-define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
-  %vec = load float, float addrspace(1)* %vaddr
-  %fma = call float @llvm.fma.f32(float %vec, float %vec, float %vec) #1
-  store float %fma, float addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fma_v2f32'
 ; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32
 ; PACKEDF32: estimated cost of 2 for {{.*}} call <2 x float> @llvm.fma.v2f32
 ; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
-define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %vec, <2 x float> %vec, <2 x float> %vec) #1
-  store <2 x float> %fma, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fma_v3f32'
 ; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32
 ; PACKEDF32: estimated cost of 4 for {{.*}} call <3 x float> @llvm.fma.v3f32
 ; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
-define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
-  %fma = call <3 x float> @llvm.fma.v3f32(<3 x float> %vec, <3 x float> %vec, <3 x float> %vec) #1
-  store <3 x float> %fma, <3 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fma_v5f32'
 ; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32
 ; PACKEDF32: estimated cost of 6 for {{.*}} call <5 x float> @llvm.fma.v5f32
 ; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
-define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
-  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
-  %fma = call <5 x float> @llvm.fma.v5f32(<5 x float> %vec, <5 x float> %vec, <5 x float> %vec) #1
-  store <5 x float> %fma, <5 x float> addrspace(1)* %out
+define amdgpu_kernel void @fma_f32() #0 { ; llvm.fma calls on f32 at widths 1, 2, 3 and 5, all arguments undef
+  %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #1 ; trailing #1 is a call-site attribute group defined outside this hunk
+  %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #1
+  %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #1
+  %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #1
   ret void
 }
 
@@ -53,33 +31,17 @@ define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float>
 ; GFX90A-FASTF64: estimated cost of 1 for {{.*}} call double @llvm.fma.f64
 ; FASTF64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
 ; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
-define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
-  %vec = load double, double addrspace(1)* %vaddr
-  %fma = call double @llvm.fma.f64(double %vec, double %vec, double %vec) #1
-  store double %fma, double addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fma_v2f64'
 ; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64
 ; GFX90A-FASTF64: estimated cost of 2 for {{.*}} call <2 x double> @llvm.fma.v2f64
 ; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
 ; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
-define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
-  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
-  %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %vec, <2 x double> %vec, <2 x double> %vec) #1
-  store <2 x double> %fma, <2 x double> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fma_v3f64'
 ; SLOWF64: estimated cost of 12 for {{.*}} call <3 x double> @llvm.fma.v3f64
 ; FASTF64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
 ; SIZEALL: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
-define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
-  %fma = call <3 x double> @llvm.fma.v3f64(<3 x double> %vec, <3 x double> %vec, <3 x double> %vec) #1
-  store <3 x double> %fma, <3 x double> addrspace(1)* %out
+define amdgpu_kernel void @fma_f64() #0 { ; llvm.fma calls on f64 at widths 1, 2 and 3, all arguments undef
+  %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #1 ; trailing #1 is a call-site attribute group defined outside this hunk
+  %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #1
+  %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #1
   ret void
 }
 
@@ -87,34 +49,18 @@ define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x doubl
 ; SLOWF16: estimated cost of 4 for {{.*}} call half @llvm.fma.f16
 ; FASTF16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
 ; SIZEALL: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
-define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
-  %vec = load half, half addrspace(1)* %vaddr
-  %fma = call half @llvm.fma.f16(half %vec, half %vec, half %vec) #1
-  store half %fma, half addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fma_v2f16'
 ; SLOWF16: estimated cost of 8 for {{.*}} call <2 x half> @llvm.fma.v2f16
 ; FASTF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
 ; SIZEF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
 ; SIZENOF16: estimated cost of 4 for {{.*}} call <2 x half> @llvm.fma.v2f16
-define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %vec, <2 x half> %vec, <2 x half> %vec) #1
-  store <2 x half> %fma, <2 x half> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fma_v3f16'
 ; SLOWF16: estimated cost of 16 for {{.*}} call <3 x half> @llvm.fma.v3f16
 ; FASTF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
 ; SIZEF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
 ; SIZENOF16: estimated cost of 8 for {{.*}} call <3 x half> @llvm.fma.v3f16
-define amdgpu_kernel void @fma_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
-  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
-  %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %vec, <3 x half> %vec, <3 x half> %vec) #1
-  store <3 x half> %fma, <3 x half> addrspace(1)* %out
+define amdgpu_kernel void @fma_f16() #0 { ; llvm.fma calls on f16 at widths 1, 2 and 3, all arguments undef
+  %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #1 ; trailing #1 is a call-site attribute group defined outside this hunk
+  %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #1
+  %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #1
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index 929a51229e5c1..75c5e76be7411 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -3,47 +3,25 @@
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s
+; END.
 
 ; ALL-LABEL: 'fmul_f32'
 ; ALL: estimated cost of 1 for {{.*}} fmul float
-define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
-  %vec = load float, float addrspace(1)* %vaddr
-  %add = fmul float %vec, %b
-  store float %add, float addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fmul_v2f32'
 ; NOPACKEDF32: estimated cost of 2 for {{.*}} fmul <2 x float>
 ; PACKEDF32: estimated cost of 1 for {{.*}} fmul <2 x float>
-define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %add = fmul <2 x float> %vec, %b
-  store <2 x float> %add, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fmul_v3f32'
-; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
-; and 3 when it is legal.
+;;; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
+;;; and 3 when it is legal.
 ; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fmul <3 x float>
 ; PACKEDF32: estimated cost of 2 for {{.*}} fmul <3 x float>
-define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
-  %add = fmul <3 x float> %vec, %b
-  store <3 x float> %add, <3 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fmul_v5f32'
-; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
-; and 5 when it is legal.
+;;; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
+;;; and 5 when it is legal.
 ; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fmul <5 x float>
 ; PACKEDF32: estimated cost of 3 for {{.*}} fmul <5 x float>
-define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
-  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
-  %add = fmul <5 x float> %vec, %b
-  store <5 x float> %add, <5 x float> addrspace(1)* %out
+define amdgpu_kernel void @fmul_f32() #0 { ; one kernel holding every f32 fmul width whose cost is checked above
+  %f32 = fmul float undef, undef ; operands are undef and the result is never used, only the opcode and type matter here
+  %v2f32 = fmul <2 x float> undef, undef
+  %v3f32 = fmul <3 x float> undef, undef ; the non-packed check above accepts either 3 or 4 for this width
+  %v5f32 = fmul <5 x float> undef, undef ; the non-packed check above accepts either 5 or 8 for this width
   ret void
 }
 
@@ -52,71 +30,32 @@ define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
 ; SLOWF64: estimated cost of 4 for {{.*}} fmul double
 ; SIZEALL: estimated cost of 2 for {{.*}} fmul double
-define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
-  %vec = load double, double addrspace(1)* %vaddr
-  %add = fmul double %vec, %b
-  store double %add, double addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fmul_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
 ; SLOWF64: estimated cost of 8 for {{.*}} fmul <2 x double>
 ; SIZEALL: estimated cost of 4 for {{.*}} fmul <2 x double>
-define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
-  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
-  %add = fmul <2 x double> %vec, %b
-  store <2 x double> %add, <2 x double> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fmul_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
 ; SLOWF64: estimated cost of 12 for {{.*}} fmul <3 x double>
 ; SIZEALL: estimated cost of 6 for {{.*}} fmul <3 x double>
-define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
-  %add = fmul <3 x double> %vec, %b
-  store <3 x double> %add, <3 x double> addrspace(1)* %out
+define amdgpu_kernel void @fmul_f64() #0 { ; fmul over double, <2 x double> and <3 x double> in one kernel
+  %f64 = fmul double undef, undef ; undef operands; the check patterns above stop at the type, so operands are never matched
+  %v2f64 = fmul <2 x double> undef, undef
+  %v3f64 = fmul <3 x double> undef, undef
   ret void
 }
 
 ; ALL-LABEL: 'fmul_f16'
 ; ALL: estimated cost of 1 for {{.*}} fmul half
-define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
-  %vec = load half, half addrspace(1)* %vaddr
-  %add = fmul half %vec, %b
-  store half %add, half addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fmul_v2f16'
 ; SLOWF16: estimated cost of 2 for {{.*}} fmul <2 x half>
 ; FASTF16: estimated cost of 1 for {{.*}} fmul <2 x half>
-define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %add = fmul <2 x half> %vec, %b
-  store <2 x half> %add, <2 x half> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fmul_v3f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fmul <3 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fmul <3 x half>
-define amdgpu_kernel void @fmul_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
-  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
-  %add = fmul <3 x half> %vec, %b
-  store <3 x half> %add, <3 x half> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fmul_v4f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fmul <4 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fmul <4 x half>
-define amdgpu_kernel void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
-  %add = fmul <4 x half> %vec, %b
-  store <4 x half> %add, <4 x half> addrspace(1)* %out
+define amdgpu_kernel void @fmul_f16() #0 { ; fmul over half and 2/3/4-wide half vectors in one kernel
+  %f16 = fmul half undef, undef ; undef operands; the check patterns above stop at the type, so operands are never matched
+  %v2f16 = fmul <2 x half> undef, undef
+  %v3f16 = fmul <3 x half> undef, undef ; checks above expect the same cost as for <4 x half>
+  %v4f16 = fmul <4 x half> undef, undef
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll
index 462a363bebfdc..0038f5b9fa3bd 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll
@@ -1,102 +1,38 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+; END.
 
 ; CHECK-LABEL: 'fneg_f32'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg float
-define amdgpu_kernel void @fneg_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) {
-  %vec = load float, float addrspace(1)* %vaddr
-  %fadd = fadd float %vec, undef
-  %fneg = fneg float %fadd
-  store float %fneg, float addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fneg_v2f32'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg <2 x float>
-define amdgpu_kernel void @fneg_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %fadd = fadd <2 x float> %vec, undef
-  %fneg = fneg <2 x float> %fadd
-  store <2 x float> %fneg, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fneg_v3f32'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg <3 x float>
-define amdgpu_kernel void @fneg_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
-  %fadd = fadd <3 x float> %vec, undef
-  %fneg = fneg <3 x float> %fadd
-  store <3 x float> %fneg, <3 x float> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fneg_v5f32'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg <5 x float>
-define amdgpu_kernel void @fneg_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) {
-  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
-  %fadd = fadd <5 x float> %vec, undef
-  %fneg = fneg <5 x float> %fadd
-  store <5 x float> %fneg, <5 x float> addrspace(1)* %out
+; CHECK: estimated cost of 0 for {{.*}} fneg float
+; CHECK: estimated cost of 0 for {{.*}} fneg <2 x float>
+; CHECK: estimated cost of 0 for {{.*}} fneg <3 x float>
+; CHECK: estimated cost of 0 for {{.*}} fneg <5 x float>
+define amdgpu_kernel void @fneg_f32() { ; fneg over scalar float and 2/3/5-wide float vectors in one kernel
+  %f32 = fneg float undef ; undef operand; the check lines above expect cost 0 for every width
+  %v2f32 = fneg <2 x float> undef
+  %v3f32 = fneg <3 x float> undef
+  %v5f32 = fneg <5 x float> undef
   ret void
 }
 
 ; CHECK-LABEL: 'fneg_f64'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg double
-define amdgpu_kernel void @fneg_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) {
-  %vec = load double, double addrspace(1)* %vaddr
-  %fadd = fadd double %vec, undef
-  %fneg = fneg double %fadd
-  store double %fneg, double addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fneg_v2f64'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg <2 x double>
-define amdgpu_kernel void @fneg_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) {
-  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
-  %fadd = fadd <2 x double> %vec, undef
-  %fneg = fneg <2 x double> %fadd
-  store <2 x double> %fneg, <2 x double> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fneg_v3f64'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg <3 x double>
-define amdgpu_kernel void @fneg_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) {
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
-  %fadd = fadd <3 x double> %vec, undef
-  %fneg = fneg <3 x double> %fadd
-  store <3 x double> %fneg, <3 x double> addrspace(1)* %out
+; CHECK: estimated cost of 0 for {{.*}} fneg double
+; CHECK: estimated cost of 0 for {{.*}} fneg <2 x double>
+; CHECK: estimated cost of 0 for {{.*}} fneg <3 x double>
+define amdgpu_kernel void @fneg_f64() { ; fneg over double, <2 x double> and <3 x double> in one kernel
+  %f64 = fneg double undef ; undef operand; the check lines above expect cost 0 for every width
+  %v2f64 = fneg <2 x double> undef
+  %v3f64 = fneg <3 x double> undef
   ret void
 }
 
 ; CHECK-LABEL: 'fneg_f16'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg half
-define amdgpu_kernel void @fneg_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) {
-  %vec = load half, half addrspace(1)* %vaddr
-  %fadd = fadd half %vec, undef
-  %fneg = fneg half %fadd
-  store half %fneg, half addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fneg_v2f16'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg <2 x half>
-define amdgpu_kernel void @fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %fadd = fadd <2 x half> %vec, undef
-  %fneg = fneg <2 x half> %fadd
-  store <2 x half> %fneg, <2 x half> addrspace(1)* %out
-  ret void
-}
-
-; CHECK-LABEL: 'fneg_v3f16'
-; CHECK: estimated cost of 0 for instruction:   %fneg = fneg <3 x half>
-define amdgpu_kernel void @fneg_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) {
-  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
-  %fadd = fadd <3 x half> %vec, undef
-  %fneg = fneg <3 x half> %fadd
-  store <3 x half> %fneg, <3 x half> addrspace(1)* %out
+; CHECK: estimated cost of 0 for {{.*}} fneg half
+; CHECK: estimated cost of 0 for {{.*}} fneg <2 x half>
+; CHECK: estimated cost of 0 for {{.*}} fneg <3 x half>
+define amdgpu_kernel void @fneg_f16() { ; fneg over half, <2 x half> and <3 x half> in one kernel
+  %f16 = fneg half undef ; undef operand; the check lines above expect cost 0 for every width
+  %v2f16 = fneg <2 x half> undef
+  %v3f16 = fneg <3 x half> undef
   ret void
 }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
index 287bba8f83b12..27d5a000ef5f8 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -2,40 +2,18 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
 ; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,FASTF16,ALL %s
 ; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,SLOWF16,ALL %s
+; END.
 
 ; ALL-LABEL: 'fsub_f32'
 ; ALL: estimated cost of 1 for {{.*}} fsub float
-define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
-  %vec = load float, float addrspace(1)* %vaddr
-  %add = fsub float %vec, %b
-  store float %add, float addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fsub_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
-define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
-  %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
-  %add = fsub <2 x float> %vec, %b
-  store <2 x float> %add, <2 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fsub_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
-define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
-  %add = fsub <3 x float> %vec, %b
-  store <3 x float> %add, <3 x float> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fsub_v5f32'
 ; ALL: estimated cost of 5 for {{.*}} fsub <5 x float>
-define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
-  %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
-  %add = fsub <5 x float> %vec, %b
-  store <5 x float> %add, <5 x float> addrspace(1)* %out
+define amdgpu_kernel void @fsub_f32() #0 { ; fsub over scalar float and 2/3/5-wide float vectors in one kernel
+  %f32 = fsub float undef, undef ; undef operands; checks above expect a cost equal to the element count (1, 2, 3, 5)
+  %v2f32 = fsub <2 x float> undef, undef
+  %v3f32 = fsub <3 x float> undef, undef
+  %v5f32 = fsub <5 x float> undef, undef
   ret void
 }
 
@@ -43,70 +21,31 @@ define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float
 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
 ; SLOWF64: estimated cost of 4 for {{.*}} fsub double
 ; SIZEALL: estimated cost of 2 for {{.*}} fsub double
-define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
-  %vec = load double, double addrspace(1)* %vaddr
-  %add = fsub double %vec, %b
-  store double %add, double addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fsub_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
 ; SLOWF64: estimated cost of 8 for {{.*}} fsub <2 x double>
 ; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double>
-define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
-  %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
-  %add = fsub <2 x double> %vec, %b
-  store <2 x double> %add, <2 x double> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fsub_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
 ; SLOWF64: estimated cost of 12 for {{.*}} fsub <3 x double>
 ; SIZEALL: estimated cost of 6 for {{.*}} fsub <3 x double>
-define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
-  %add = fsub <3 x double> %vec, %b
-  store <3 x double> %add, <3 x double> addrspace(1)* %out
+define amdgpu_kernel void @fsub_f64() #0 { ; fsub over double, <2 x double> and <3 x double> in one kernel
+  %f64 = fsub double undef, undef ; undef operands; the check patterns above stop at the type, so operands are never matched
+  %v2f64 = fsub <2 x double> undef, undef
+  %v3f64 = fsub <3 x double> undef, undef
   ret void
 }
 
 ; ALL-LABEL: 'fsub_f16'
 ; ALL: estimated cost of 1 for {{.*}} fsub half
-define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
-  %vec = load half, half addrspace(1)* %vaddr
-  %add = fsub half %vec, %b
-  store half %add, half addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fsub_v2f16'
 ; SLOWF16: estimated cost of 2 for {{.*}} fsub <2 x half>
 ; FASTF16: estimated cost of 1 for {{.*}} fsub <2 x half>
-define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
-  %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
-  %add = fsub <2 x half> %vec, %b
-  store <2 x half> %add, <2 x half> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fsub_v3f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fsub <3 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fsub <3 x half>
-define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
-  %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
-  %add = fsub <3 x half> %vec, %b
-  store <3 x half> %add, <3 x half> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'fsub_v4f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fsub <4 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fsub <4 x half>
-define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
-  %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
-  %add = fsub <4 x half> %vec, %b
-  store <4 x half> %add, <4 x half> addrspace(1)* %out
+define amdgpu_kernel void @fsub_f16() #0 { ; fsub over half and 2/3/4-wide half vectors in one kernel
+  %f16 = fsub half undef, undef ; undef operands; the check patterns above stop at the type, so operands are never matched
+  %v2f16 = fsub <2 x half> undef, undef
+  %v3f16 = fsub <3 x half> undef, undef ; checks above expect the same cost as for <4 x half>
+  %v4f16 = fsub <4 x half> undef, undef
   ret void
 }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
index 5fbd7835351e7..52b745bbad3b3 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
@@ -6,167 +6,109 @@
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,SZNOCONTRACT,SIZEALL,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,SIZEALL,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,SZNOCONTRACT,SIZEALL,ALL %s
+; END.
 
 target triple = "amdgcn--"
 
 ; ALL-LABEL: 'fmul_fadd_f32':
-; FUSED: estimated cost of 0 for instruction:   %mul = fmul float
-; SLOW: estimated cost of 1 for instruction:   %mul = fmul float
-; GFX1030: estimated cost of 1 for instruction:   %mul = fmul float
-; ALL: estimated cost of 1 for instruction:   %add = fadd float
-define float @fmul_fadd_f32(float %r0, float %r1, float %r2) #0 {
-  %mul = fmul float %r0, %r1
-  %add = fadd float %mul, %r2
-  ret float %add
-}
-
-; ALL-LABEL: 'fmul_fadd_contract_f32':
-; ALL: estimated cost of 0 for instruction:   %mul = fmul contract float
-; ALL: estimated cost of 1 for instruction:   %add = fadd contract float
-define float @fmul_fadd_contract_f32(float %r0, float %r1, float %r2) #0 {
-  %mul = fmul contract float %r0, %r1
-  %add = fadd contract float %mul, %r2
-  ret float %add
-}
-
-; ALL-LABEL: 'fmul_fadd_v2f32':
-; FUSED: estimated cost of 0 for instruction:   %mul = fmul <2 x float>
-; SLOW: estimated cost of 2 for instruction:   %mul = fmul <2 x float>
-; GFX1030: estimated cost of 2 for instruction:   %mul = fmul <2 x float>
-; ALL: estimated cost of 2 for instruction:   %add = fadd <2 x float>
-define <2 x float> @fmul_fadd_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
-  %mul = fmul <2 x float> %r0, %r1
-  %add = fadd <2 x float> %mul, %r2
-  ret <2 x float> %add
-}
-
-; ALL-LABEL: 'fmul_fsub_f32':
-; FUSED: estimated cost of 0 for instruction:   %mul = fmul float
-; SLOW: estimated cost of 1 for instruction:   %mul = fmul float
-; GFX1030: estimated cost of 1 for instruction:   %mul = fmul float
-; ALL: estimated cost of 1 for instruction:   %sub = fsub float
-define float @fmul_fsub_f32(float %r0, float %r1, float %r2) #0 {
-  %mul = fmul float %r0, %r1
-  %sub = fsub float %mul, %r2
-  ret float %sub
-}
-
-; ALL-LABEL: 'fmul_fsub_v2f32':
-; FUSED: estimated cost of 0 for instruction:   %mul = fmul <2 x float>
-; SLOW: estimated cost of 2 for instruction:   %mul = fmul <2 x float>
-; GFX1030: estimated cost of 2 for instruction:   %mul = fmul <2 x float>
-; ALL: estimated cost of 2 for instruction:   %sub = fsub <2 x float>
-define <2 x float> @fmul_fsub_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
-  %mul = fmul <2 x float> %r0, %r1
-  %sub = fsub <2 x float> %mul, %r2
-  ret <2 x float> %sub
+; FUSED: estimated cost of 0 for {{.*}} fmul float
+; SLOW: estimated cost of 1 for {{.*}} fmul float
+; GFX1030: estimated cost of 1 for {{.*}} fmul float
+; ALL: estimated cost of 1 for {{.*}} fadd float
+; ALL: estimated cost of 0 for {{.*}} fmul contract float
+; ALL: estimated cost of 1 for {{.*}} fadd contract float
+; FUSED: estimated cost of 0 for {{.*}} fmul <2 x float>
+; SLOW: estimated cost of 2 for {{.*}} fmul <2 x float>
+; GFX1030: estimated cost of 2 for {{.*}} fmul <2 x float>
+; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
+; FUSED: estimated cost of 0 for {{.*}} fmul float
+; SLOW: estimated cost of 1 for {{.*}} fmul float
+; GFX1030: estimated cost of 1 for {{.*}} fmul float
+; ALL: estimated cost of 1 for {{.*}} fsub float
+; FUSED: estimated cost of 0 for {{.*}} fmul <2 x float>
+; SLOW: estimated cost of 2 for {{.*}} fmul <2 x float>
+; GFX1030: estimated cost of 2 for {{.*}} fmul <2 x float>
+; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
+define void @fmul_fadd_f32() #0 { ; five fmul -> fadd/fsub pairs on float and <2 x float>, in the same order as the check lines above
+  %f32 = fmul float undef, undef
+  %f32add = fadd float %f32, undef ; consumes the fmul result, so each pair forms a multiply-then-add chain
+  %f32c = fmul contract float undef, undef ; contract flag on both ops of this pair; the checks above expect cost 0 for this fmul under the ALL prefix
+  %f32cadd = fadd contract float %f32c, undef
+  %v2f32 = fmul <2 x float> undef, undef
+  %v2f32add = fadd <2 x float> %v2f32, undef
+  %f32_2 = fmul float undef, undef ; _2 suffix: second plain float fmul, this one feeding an fsub
+  %f32sub = fsub float %f32_2, undef
+  %v2f32_2 = fmul <2 x float> undef, undef
+  %v2f32sub = fsub <2 x float> %v2f32_2, undef
+  ret void
 }
 
 ; ALL-LABEL: 'fmul_fadd_f16':
-; FUSED: estimated cost of 0 for instruction:   %mul = fmul half
-; SLOW: estimated cost of 1 for instruction:   %mul = fmul half
-; ALL: estimated cost of 1 for instruction:   %add = fadd half
-define half @fmul_fadd_f16(half %r0, half %r1, half %r2) #0 {
-  %mul = fmul half %r0, %r1
-  %add = fadd half %mul, %r2
-  ret half %add
-}
-
-; ALL-LABEL: 'fmul_fadd_contract_f16':
-; ALL: estimated cost of 0 for instruction:   %mul = fmul contract half
-; ALL: estimated cost of 1 for instruction:   %add = fadd contract half
-define half @fmul_fadd_contract_f16(half %r0, half %r1, half %r2) #0 {
-  %mul = fmul contract half %r0, %r1
-  %add = fadd contract half %mul, %r2
-  ret half %add
-}
-
-; ALL-LABEL: 'fmul_fadd_v2f16':
-; FUSED: estimated cost of 0 for instruction:   %mul = fmul <2 x half>
-; SLOW: estimated cost of 1 for instruction:   %mul = fmul <2 x half>
-; ALL: estimated cost of 1 for instruction:   %add = fadd <2 x half>
-define <2 x half> @fmul_fadd_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 {
-  %mul = fmul <2 x half> %r0, %r1
-  %add = fadd <2 x half> %mul, %r2
-  ret <2 x half> %add
-}
-
-; ALL-LABEL: 'fmul_fsub_f16':
-; FUSED: estimated cost of 0 for instruction:   %mul = fmul half
-; SLOW: estimated cost of 1 for instruction:   %mul = fmul half
-; ALL: estimated cost of 1 for instruction:   %sub = fsub half
-define half @fmul_fsub_f16(half %r0, half %r1, half %r2) #0 {
-  %mul = fmul half %r0, %r1
-  %sub = fsub half %mul, %r2
-  ret half %sub
-}
-
-; ALL-LABEL: 'fmul_fsub_v2f16':
-; FUSED: estimated cost of 0 for instruction:   %mul = fmul <2 x half>
-; SLOW: estimated cost of 1 for instruction:   %mul = fmul <2 x half>
-; ALL: estimated cost of 1 for instruction:   %sub = fsub <2 x half>
-define <2 x half> @fmul_fsub_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 {
-  %mul = fmul <2 x half> %r0, %r1
-  %sub = fsub <2 x half> %mul, %r2
-  ret <2 x half> %sub
+; FUSED: estimated cost of 0 for {{.*}} fmul half
+; SLOW: estimated cost of 1 for {{.*}} fmul half
+; ALL: estimated cost of 1 for {{.*}} fadd half
+; ALL: estimated cost of 0 for {{.*}} fmul contract half
+; ALL: estimated cost of 1 for {{.*}} fadd contract half
+; FUSED: estimated cost of 0 for {{.*}} fmul <2 x half>
+; SLOW: estimated cost of 1 for {{.*}} fmul <2 x half>
+; ALL: estimated cost of 1 for {{.*}} fadd <2 x half>
+; FUSED: estimated cost of 0 for {{.*}} fmul half
+; SLOW: estimated cost of 1 for {{.*}} fmul half
+; ALL: estimated cost of 1 for {{.*}} fsub half
+; FUSED: estimated cost of 0 for {{.*}} fmul <2 x half>
+; SLOW: estimated cost of 1 for {{.*}} fmul <2 x half>
+; ALL: estimated cost of 1 for {{.*}} fsub <2 x half>
+define void @fmul_fadd_f16() #0 { ; five fmul -> fadd/fsub pairs on half and <2 x half>, in the same order as the check lines above
+  %f16 = fmul half undef, undef
+  %f16add = fadd half %f16, undef ; consumes the fmul result, so each pair forms a multiply-then-add chain
+  %f16c = fmul contract half undef, undef ; contract flag on both ops of this pair; the checks above expect cost 0 for this fmul under the ALL prefix
+  %f16cadd = fadd contract half %f16c, undef
+  %v2f16 = fmul <2 x half> undef, undef
+  %v2f16add = fadd <2 x half> %v2f16, undef
+  %f16_2 = fmul half undef, undef ; _2 suffix: second plain half fmul, this one feeding an fsub
+  %f16sub = fsub half %f16_2, undef
+  %v2f16_2 = fmul <2 x half> undef, undef
+  %v2f16sub = fsub <2 x half> %v2f16_2, undef
+  ret void
 }
 
 ; ALL-LABEL: 'fmul_fadd_f64':
-; CONTRACT: estimated cost of 0 for instruction:   %mul = fmul double
-; NOCONTRACT: estimated cost of 4 for instruction:   %mul = fmul double
-; SZNOCONTRACT: estimated cost of 2 for instruction:   %mul = fmul double
-; THRPTALL: estimated cost of 4 for instruction:   %add = fadd double
-; SIZEALL: estimated cost of 2 for instruction:   %add = fadd double
-define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {
-  %mul = fmul double %r0, %r1
-  %add = fadd double %mul, %r2
-  ret double %add
-}
-
-; ALL-LABEL: 'fmul_fadd_contract_f64':
-; ALL: estimated cost of 0 for instruction:   %mul = fmul contract double
-; THRPTALL: estimated cost of 4 for instruction:   %add = fadd contract double
-; SIZEALL: estimated cost of 2 for instruction:   %add = fadd contract double
-define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {
-  %mul = fmul contract double %r0, %r1
-  %add = fadd contract double %mul, %r2
-  ret double %add
-}
-
-; ALL-LABEL: 'fmul_fadd_v2f64':
-; CONTRACT: estimated cost of 0 for instruction:   %mul = fmul <2 x double>
-; NOCONTRACT: estimated cost of 8 for instruction:   %mul = fmul <2 x double>
-; SZNOCONTRACT: estimated cost of 4 for instruction:   %mul = fmul <2 x double>
-; THRPTALL: estimated cost of 8 for instruction:   %add = fadd <2 x double>
-; SIZEALL: estimated cost of 4 for instruction:   %add = fadd <2 x double>
-define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
-  %mul = fmul <2 x double> %r0, %r1
-  %add = fadd <2 x double> %mul, %r2
-  ret <2 x double> %add
-}
-
-; ALL-LABEL: 'fmul_fsub_f64':
-; CONTRACT: estimated cost of 0 for instruction:   %mul = fmul double
-; NOCONTRACT: estimated cost of 4 for instruction:   %mul = fmul double
-; SZNOCONTRACT: estimated cost of 2 for instruction:   %mul = fmul double
-; THRPTALL: estimated cost of 4 for instruction:   %sub = fsub double
-; SIZEALL: estimated cost of 2 for instruction:   %sub = fsub double
-define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 {
-  %mul = fmul double %r0, %r1
-  %sub = fsub double %mul, %r2
-  ret double %sub
-}
-
-; ALL-LABEL: 'fmul_fsub_v2f64':
-; CONTRACT: estimated cost of 0 for instruction:   %mul = fmul <2 x double>
-; NOCONTRACT: estimated cost of 8 for instruction:   %mul = fmul <2 x double>
-; SZNOCONTRACT: estimated cost of 4 for instruction:   %mul = fmul <2 x double>
-; THRPTALL: estimated cost of 8 for instruction:   %sub = fsub <2 x double>
-; SIZEALL: estimated cost of 4 for instruction:   %sub = fsub <2 x double>
-define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
-  %mul = fmul <2 x double> %r0, %r1
-  %sub = fsub <2 x double> %mul, %r2
-  ret <2 x double> %sub
+; CONTRACT: estimated cost of 0 for {{.*}} fmul double
+; NOCONTRACT: estimated cost of 4 for {{.*}} fmul double
+; SZNOCONTRACT: estimated cost of 2 for {{.*}} fmul double
+; THRPTALL: estimated cost of 4 for {{.*}} fadd double
+; SIZEALL: estimated cost of 2 for {{.*}} fadd double
+; ALL: estimated cost of 0 for {{.*}} fmul contract double
+; THRPTALL: estimated cost of 4 for {{.*}} fadd contract double
+; SIZEALL: estimated cost of 2 for {{.*}} fadd contract double
+; CONTRACT: estimated cost of 0 for {{.*}} fmul <2 x double>
+; NOCONTRACT: estimated cost of 8 for {{.*}} fmul <2 x double>
+; SZNOCONTRACT: estimated cost of 4 for {{.*}} fmul <2 x double>
+; THRPTALL: estimated cost of 8 for {{.*}} fadd <2 x double>
+; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double>
+; CONTRACT: estimated cost of 0 for {{.*}} fmul double
+; NOCONTRACT: estimated cost of 4 for {{.*}} fmul double
+; SZNOCONTRACT: estimated cost of 2 for {{.*}} fmul double
+; THRPTALL: estimated cost of 4 for {{.*}} fsub double
+; SIZEALL: estimated cost of 2 for {{.*}} fsub double
+; CONTRACT: estimated cost of 0 for {{.*}} fmul <2 x double>
+; NOCONTRACT: estimated cost of 8 for {{.*}} fmul <2 x double>
+; SZNOCONTRACT: estimated cost of 4 for {{.*}} fmul <2 x double>
+; THRPTALL: estimated cost of 8 for {{.*}} fsub <2 x double>
+; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double>
+define void @fmul_fadd_f64() #0 { ; five fmul -> fadd/fsub pairs on double and <2 x double>, in the same order as the check lines above
+  %f64 = fmul double undef, undef
+  %f64add = fadd double %f64, undef ; consumes the fmul result, so each pair forms a multiply-then-add chain
+  %f64c = fmul contract double undef, undef ; contract flag on both ops of this pair; the checks above expect cost 0 for this fmul under the ALL prefix
+  %f64cadd = fadd contract double %f64c, undef
+  %v2f64 = fmul <2 x double> undef, undef
+  %v2f64add = fadd <2 x double> %v2f64, undef
+  %f64_2 = fmul double undef, undef ; _2 suffix: second plain double fmul, this one feeding an fsub
+  %f64sub = fsub double %f64_2, undef
+  %v2f64_2 = fmul <2 x double> undef, undef
+  %v2f64sub = fsub <2 x double> %v2f64_2, undef
+  ret void
 }
 
 attributes #0 = { nounwind }
+
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll b/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll
index 7bd86db270aaa..a7d28413319bf 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll
@@ -4,49 +4,20 @@
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,GFX89 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX89 %s
+; END.
 
-; GCN-LABEL: 'insertelement_v2i32'
+; GCN-LABEL: 'insertelement_v2'
 ; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i32>
-define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
-  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
-  %insert = insertelement <2 x i32> %vec, i32 123, i32 1
-  store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: 'insertelement_v2i64'
 ; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i64>
-define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
-  %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
-  %insert = insertelement <2 x i64> %vec, i64 123, i64 1
-  store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: 'insertelement_0_v2i16'
 ; CI: estimated cost of 1 for {{.*}} insertelement <2 x i16>
 ; GFX89: estimated cost of 0 for {{.*}} insertelement <2 x i16>
-define amdgpu_kernel void @insertelement_0_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %insert = insertelement <2 x i16> %vec, i16 123, i16 0
-  store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: 'insertelement_1_v2i16'
 ; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i16>
-define amdgpu_kernel void @insertelement_1_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %insert = insertelement <2 x i16> %vec, i16 123, i16 1
-  store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: 'insertelement_1_v2i8'
 ; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i8>
-define amdgpu_kernel void @insertelement_1_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
-  %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
-  %insert = insertelement <2 x i8> %vec, i8 123, i8 1
-  store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
+define amdgpu_kernel void @insertelement_v2() { ; inserts the constant 123 into undef 2-element vectors of i32, i64, i16 and i8
+  %v2i32_1 = insertelement <2 x i32> undef, i32 123, i32 1 ; the _N name suffix is the lane index being written
+  %v2i64_1 = insertelement <2 x i64> undef, i64 123, i64 1
+  %v2i16_0 = insertelement <2 x i16> undef, i16 123, i16 0 ; lane 0 of <2 x i16>: the only case where the CI and GFX89 checks above differ (1 vs 0)
+  %v2i16_1 = insertelement <2 x i16> undef, i16 123, i16 1
+  %v2i8_1 = insertelement <2 x i8> undef, i8 123, i8 1
   ret void
 }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/logicalop.ll b/llvm/test/Analysis/CostModel/AMDGPU/logicalop.ll
index 21e23d73cdd10..0eedf867502eb 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/logicalop.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/logicalop.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s --check-prefix=CHECK-THROUGHPUT
 ; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s --check-prefix=CHECK-SIZE
+; END.
 
 define amdgpu_kernel void @op() {
   ; Logical and/or - select's cost must be equivalent to that of binop
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
index e4ca0685708f7..33109ff18a2c6 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -2,139 +2,63 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,THRPTALL,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZESLOW16,SIZEALL,ALL %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=SIZEFAST16,SIZEALL,ALL %s
+; END.
 
 ; ALL-LABEL: 'mul_i32'
 ; THRPTALL: estimated cost of 4 for {{.*}} mul i32
 ; SIZEALL: estimated cost of 2 for {{.*}} mul i32
-define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %mul = mul i32 %vec, %b
-  store i32 %mul, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'mul_v2i32'
 ; THRPTALL: estimated cost of 8 for {{.*}} mul <2 x i32>
 ; SIZEALL: estimated cost of 4 for {{.*}} mul <2 x i32>
-define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
-  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
-  %mul = mul <2 x i32> %vec, %b
-  store <2 x i32> %mul, <2 x i32> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'mul_v3i32'
 ; THRPTALL: estimated cost of 12 for {{.*}} mul <3 x i32>
 ; SIZEALL: estimated cost of 6 for {{.*}} mul <3 x i32>
-define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
-  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
-  %mul = mul <3 x i32> %vec, %b
-  store <3 x i32> %mul, <3 x i32> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'mul_v5i32'
-; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32>
-; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32>
-define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
-  %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
-  %mul = mul <5 x i32> %vec, %b
-  store <5 x i32> %mul, <5 x i32> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'mul_v4i32'
 ; THRPTALL: estimated cost of 16 for {{.*}} mul <4 x i32>
 ; SIZEALL: estimated cost of 8 for {{.*}} mul <4 x i32>
-define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
-  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
-  %mul = mul <4 x i32> %vec, %b
-  store <4 x i32> %mul, <4 x i32> addrspace(1)* %out
+; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32>
+; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32>
+define amdgpu_kernel void @mul_i32() #0 {
+  %i32 = mul i32 undef, undef
+  %v2i32 = mul <2 x i32> undef, undef
+  %v3i32 = mul <3 x i32> undef, undef
+  %v4i32 = mul <4 x i32> undef, undef
+  %v5i32 = mul <5 x i32> undef, undef
   ret void
 }
 
 ; ALL-LABEL: 'mul_i64'
 ; THRPTALL: estimated cost of 20 for {{.*}} mul i64
 ; SIZEALL: estimated cost of 12 for {{.*}} mul i64
-define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
-  %vec = load i64, i64 addrspace(1)* %vaddr
-  %mul = mul i64 %vec, %b
-  store i64 %mul, i64 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'mul_v2i64'
 ; THRPTALL: estimated cost of 40 for {{.*}} mul <2 x i64>
 ; SIZEALL: estimated cost of 24 for {{.*}} mul <2 x i64>
-define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
-  %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
-  %mul = mul <2 x i64> %vec, %b
-  store <2 x i64> %mul, <2 x i64> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'mul_v3i64'
 ; THRPTALL: estimated cost of 60 for {{.*}} mul <3 x i64>
 ; SIZEALL: estimated cost of 36 for {{.*}} mul <3 x i64>
-define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
-  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
-  %mul = mul <3 x i64> %vec, %b
-  store <3 x i64> %mul, <3 x i64> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'mul_v4i64'
 ; THRPTALL: estimated cost of 80 for {{.*}} mul <4 x i64>
 ; SIZEALL: estimated cost of 48 for {{.*}} mul <4 x i64>
-define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
-  %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
-  %mul = mul <4 x i64> %vec, %b
-  store <4 x i64> %mul, <4 x i64> addrspace(1)* %out
-  ret void
-}
-
-
-; ALL-LABEL: 'mul_v8i64'
 ; THRPTALL: estimated cost of 320 for {{.*}} mul <8 x i64>
 ; SIZEALL: estimated cost of 192 for {{.*}} mul <8 x i64>
-define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
-  %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
-  %mul = mul <8 x i64> %vec, %b
-  store <8 x i64> %mul, <8 x i64> addrspace(1)* %out
+define amdgpu_kernel void @mul_i64() #0 {
+  %i64 = mul i64 undef, undef
+  %v2i64 = mul <2 x i64> undef, undef
+  %v3i64 = mul <3 x i64> undef, undef
+  %v4i64 = mul <4 x i64> undef, undef
+  %v8i64 = mul <8 x i64> undef, undef
   ret void
 }
 
 ; ALL-LABEL: 'mul_i16'
 ; THRPTALL: estimated cost of 4 for {{.*}} mul i16
 ; SIZEALL: estimated cost of 2 for {{.*}} mul i16
-define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
-  %vec = load i16, i16 addrspace(1)* %vaddr
-  %mul = mul i16 %vec, %b
-  store i16 %mul, i16 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'mul_v2i16'
 ; SLOW16: estimated cost of 8 for {{.*}} mul <2 x i16>
 ; FAST16: estimated cost of 4 for {{.*}} mul <2 x i16>
 ; SIZESLOW16: estimated cost of 4 for {{.*}} mul <2 x i16>
 ; SIZEFAST16: estimated cost of 2 for {{.*}} mul <2 x i16>
-define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %mul = mul <2 x i16> %vec, %b
-  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'mul_v3i16'
 ; SLOW16: estimated cost of 16 for {{.*}} mul <3 x i16>
 ; FAST16: estimated cost of 8 for {{.*}} mul <3 x i16>
 ; SIZESLOW16: estimated cost of 8 for {{.*}} mul <3 x i16>
 ; SIZEFAST16: estimated cost of 4 for {{.*}} mul <3 x i16>
-define amdgpu_kernel void @mul_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %vaddr, <3 x i16> %b) #0 {
-  %vec = load <3 x i16>, <3 x i16> addrspace(1)* %vaddr
-  %mul = mul <3 x i16> %vec, %b
-  store <3 x i16> %mul, <3 x i16> addrspace(1)* %out
+define amdgpu_kernel void @mul_i16() #0 {
+  %i16 = mul i16 undef, undef
+  %v2i16 = mul <2 x i16> undef, undef
+  %v3i16 = mul <3 x i16> undef, undef
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll b/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll
index 07592b1f8d4c4..6357408e89148 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -cost-model -cost-kind=throughput -analyze | FileCheck %s
+; END.
 
 define i32 @reduce_i1(i32 %arg) {
 ; CHECK-LABEL: 'reduce_i1'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll b/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll
index c78c115fe6b8c..906ead77c092c 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -cost-model -cost-kind=throughput -analyze | FileCheck %s
+; END.
 
 define i32 @reduce_i1(i32 %arg) {
 ; CHECK-LABEL: 'reduce_i1'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
index 42936644e590b..f67a0fae8e127 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
@@ -2,120 +2,52 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FAST16 %s
 ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOW16 %s
+; END.
 
-; ALL-LABEL: 'shl_i32'
+; ALL-LABEL: 'shl'
 ; ALL: estimated cost of 1 for {{.*}} shl i32
-define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %or = shl i32 %vec, %b
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'shl_i64'
 ; FAST64: estimated cost of 2 for {{.*}} shl i64
 ; SLOW64: estimated cost of 4 for {{.*}} shl i64
 ; SIZEALL: estimated cost of 2 for {{.*}} shl i64
-define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
-  %vec = load i64, i64 addrspace(1)* %vaddr
-  %or = shl i64 %vec, %b
-  store i64 %or, i64 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'shl_i16'
 ; ALL: estimated cost of 1 for {{.*}} shl i16
-define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
-  %vec = load i16, i16 addrspace(1)* %vaddr
-  %or = shl i16 %vec, %b
-  store i16 %or, i16 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'shl_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} shl <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} shl <2 x i16>
-define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %or = shl <2 x i16> %vec, %b
-  store <2 x i16> %or, <2 x i16> addrspace(1)* %out
+define amdgpu_kernel void @shl() #0 {
+  %i32 = shl i32 undef, undef
+  %i64 = shl i64 undef, undef
+  %i16 = shl i16 undef, undef
+  %v2i16 = shl <2 x i16> undef, undef
   ret void
 }
 
-; ALL-LABEL: 'lshr_i32'
+; ALL-LABEL: 'lshr'
 ; ALL: estimated cost of 1 for {{.*}} lshr i32
-define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %or = lshr i32 %vec, %b
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'lshr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
 ; SLOW64: estimated cost of 4 for {{.*}} lshr i64
 ; SIZEALL: estimated cost of 2 for {{.*}} lshr i64
-define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
-  %vec = load i64, i64 addrspace(1)* %vaddr
-  %or = lshr i64 %vec, %b
-  store i64 %or, i64 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'lshr_i16'
 ; ALL: estimated cost of 1 for {{.*}} lshr i16
-define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
-  %vec = load i16, i16 addrspace(1)* %vaddr
-  %or = lshr i16 %vec, %b
-  store i16 %or, i16 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'lshr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} lshr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} lshr <2 x i16>
-define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %or = lshr <2 x i16> %vec, %b
-  store <2 x i16> %or, <2 x i16> addrspace(1)* %out
+define amdgpu_kernel void @lshr() #0 {
+  %i32 = lshr i32 undef, undef
+  %i64 = lshr i64 undef, undef
+  %i16 = lshr i16 undef, undef
+  %v2i16 = lshr <2 x i16> undef, undef
   ret void
 }
 
-; ALL-LABEL: 'ashr_i32'
+; ALL-LABEL: 'ashr'
 ; ALL: estimated cost of 1 for {{.*}} ashr i32
-define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
-  %vec = load i32, i32 addrspace(1)* %vaddr
-  %or = ashr i32 %vec, %b
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
 ; SLOW64: estimated cost of 4 for {{.*}} ashr i64
-define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
-  %vec = load i64, i64 addrspace(1)* %vaddr
-  %or = ashr i64 %vec, %b
-  store i64 %or, i64 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'ashr_i16'
 ; ALL: estimated cost of 1 for {{.*}} ashr i16
-define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
-  %vec = load i16, i16 addrspace(1)* %vaddr
-  %or = ashr i16 %vec, %b
-  store i16 %or, i16 addrspace(1)* %out
-  ret void
-}
-
-; ALL-LABEL: 'ashr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} ashr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} ashr <2 x i16>
-define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
-  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
-  %or = ashr <2 x i16> %vec, %b
-  store <2 x i16> %or, <2 x i16> addrspace(1)* %out
+define amdgpu_kernel void @ashr() #0 {
+  %i32 = ashr i32 undef, undef
+  %i64 = ashr i64 undef, undef
+  %i16 = ashr i16 undef, undef
+  %v2i16 = ashr <2 x i16> undef, undef
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index 7fe1cebd879f0..cdbd9ec570abe 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -5,6 +5,7 @@
 ; RUN: opt < %s -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -cost-kind=code-size -S | FileCheck -check-prefixes=GFX9-CS %s
 ; RUN: opt < %s -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -S | FileCheck -check-prefixes=VI %s
 ; RUN: opt < %s -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -cost-kind=code-size -S | FileCheck -check-prefixes=VI-CS %s
+; END.
 
 define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> %vec0, <2 x i16> %vec1) {
 ; GFX10-LABEL: 'shufflevector_00_v2i16'