diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index 3aa682cd2971c..b66e19e8bc563 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -1144,3 +1144,53 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %
   %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32>
   ret void
 }
+
+define void @insert(<16 x i8> %i8v16, <16 x i8> %i8v16_2) {
+; ALL-LABEL: 'insert'
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test3 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test4 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test5 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test6 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test7 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'insert'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test3 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test4 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test5 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test6 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test7 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test3 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test4 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test5 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test6 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test7 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  ret void
+}
+
+define void @splice(<16 x i8> %i8v16, <16 x i8> %i8v16_2) {
+; ALL-LABEL: 'splice'
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'splice'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
index f71fdbdee527b..c9a013bd58322 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name I --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
@@ -5,9 +6,18 @@
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fadd_combine_v2f16
-; GCN: fadd <2 x half>
 
 define void @fadd_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fadd_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fadd <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -24,9 +34,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fsub_combine_v2f16
-; GCN: fsub <2 x half>
 define void @fsub_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fsub_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fsub <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -43,9 +62,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fmul_combine_v2f16
-; GCN: fmul <2 x half>
 define void @fmul_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fmul_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fmul <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -61,9 +89,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @fdiv_combine_v2f16
-; GCN: fdiv <2 x half>
 define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fdiv_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fdiv <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -79,9 +116,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @frem_combine_v2f16
-; GCN: frem <2 x half>
 define void @frem_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @frem_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = frem <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -98,9 +144,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fma_combine_v2f16
-; GCN: call <2 x half> @llvm.fma.v2f16
 define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define amdgpu_kernel void @fma_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -117,9 +172,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fmuladd_combine_v2f16
-; GCN: call <2 x half> @llvm.fmuladd.v2f16
 define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define amdgpu_kernel void @fmuladd_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -135,12 +199,35 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @minnum_combine_v2f16
-; GFX8: call half @llvm.minnum.f16(
-; GFX8: call half @llvm.minnum.f16(
-; GFX9: call <2 x half> @llvm.minnum.v2f16
 
 define void @minnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @minnum_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.minnum.f16(half [[ITMP3]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.minnum.f16(half [[ITMP7]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @minnum_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -156,12 +243,35 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @maxnum_combine_v2f16
-; GFX8: call half @llvm.maxnum.f16(
-; GFX8: call half @llvm.maxnum.f16(
-; GFX9: call <2 x half> @llvm.maxnum.v2f16
 
 define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @maxnum_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.maxnum.f16(half [[ITMP3]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.maxnum.f16(half [[ITMP7]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @maxnum_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -178,10 +288,23 @@ bb:
 }
 
 ; FIXME: Should vectorize
-; GCN-LABEL: @minimum_combine_v2f16
-; GCN: call half @llvm.minimum.f16(
-; GCN: call half @llvm.minimum.f16(
 define void @minimum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @minimum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP4:%.*]] = call half @llvm.minimum.f16(half [[ITMP3]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GCN-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GCN-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: [[ITMP8:%.*]] = call half @llvm.minimum.f16(half [[ITMP7]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -197,10 +320,23 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @maximum_combine_v2f16
-; GCN: call half @llvm.maximum.f16(
-; GCN: call half @llvm.maximum.f16(
 define void @maximum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @maximum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP4:%.*]] = call half @llvm.maximum.f16(half [[ITMP3]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GCN-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GCN-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: [[ITMP8:%.*]] = call half @llvm.maximum.f16(half [[ITMP7]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -216,9 +352,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @canonicalize_combine_v2f16
-; GCN: call <2 x half> @llvm.canonicalize.v2f16(
 define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @canonicalize_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]])
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -234,9 +379,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @fabs_combine_v2f16
-; GCN: call <2 x half> @llvm.fabs.v2f16(
 define void @fabs_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fabs_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP0]])
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -252,9 +406,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @fneg_combine_v2f16
-; GCN: fneg <2 x half>
 define void @fneg_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fneg_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fneg <2 x half> [[TMP0]]
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -270,11 +433,36 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @copysign_combine_v2f16
-; GFX8: call half @llvm.copysign.f16(
-; GFX8: call half @llvm.copysign.f16(
-; GFX9: call <2 x half> @llvm.copysign.v2f16(
 define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
+; GFX8-LABEL: define void @copysign_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.copysign.f16(half [[ITMP3]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.copysign.f16(half [[ITMP7]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @copysign_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[SIGN]], i32 0
+; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <2 x i32> zeroinitializer
+; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP2]])
+; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -291,12 +479,59 @@ bb:
 }
 
 ; FIXME: Should always vectorize
-; GCN-LABEL: @copysign_combine_v4f16
-; GFX8: call half @llvm.copysign.f16(
-; GFX8: call half @llvm.copysign.f16(
-; GFX9: call <2 x half> @llvm.copysign.v2f16(
 
 define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) {
+; GFX8-LABEL: define void @copysign_combine_v4f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.copysign.f16(half [[ITMP3]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.copysign.f16(half [[ITMP7]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2
+; GFX8-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]]
+; GFX8-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP12:%.*]] = call half @llvm.copysign.f16(half [[ITMP11]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP12]], ptr addrspace(1) [[ITMP10]], align 2
+; GFX8-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3
+; GFX8-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]]
+; GFX8-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2
+; GFX8-NEXT: [[ITMP16:%.*]] = call half @llvm.copysign.f16(half [[ITMP15]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP16]], ptr addrspace(1) [[ITMP14]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @copysign_combine_v4f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX9-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[SIGN]], i32 0
+; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <2 x i32> zeroinitializer
+; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP2]])
+; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2
+; GFX9-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]]
+; GFX9-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX9-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3
+; GFX9-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]]
+; GFX9-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2
+; GFX9-NEXT: [[TMP4:%.*]] = insertelement <2 x half> poison, half [[ITMP11]], i32 0
+; GFX9-NEXT: [[TMP5:%.*]] = insertelement <2 x half> [[TMP4]], half [[ITMP15]], i32 1
+; GFX9-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP5]], <2 x half> [[TMP2]])
+; GFX9-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[ITMP10]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -326,12 +561,54 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @canonicalize_combine_v4f16
-; GFX8: call half @llvm.canonicalize.f16(
-; GFX8: call half @llvm.canonicalize.f16(
-; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
 
 define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @canonicalize_combine_v4f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]])
+; GFX8-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2
+; GFX8-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]]
+; GFX8-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP12:%.*]] = call half @llvm.canonicalize.f16(half [[ITMP11]])
+; GFX8-NEXT: store half [[ITMP12]], ptr addrspace(1) [[ITMP10]], align 2
+; GFX8-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3
+; GFX8-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]]
+; GFX8-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2
+; GFX8-NEXT: [[ITMP16:%.*]] = call half @llvm.canonicalize.f16(half [[ITMP15]])
+; GFX8-NEXT: store half [[ITMP16]], ptr addrspace(1) [[ITMP14]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @canonicalize_combine_v4f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX9-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]])
+; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2
+; GFX9-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]]
+; GFX9-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX9-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3
+; GFX9-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]]
+; GFX9-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2
+; GFX9-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[ITMP11]], i32 0
+; GFX9-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[ITMP15]], i32 1
+; GFX9-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP3]])
+; GFX9-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[ITMP10]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -362,10 +639,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @minimumnum_combine_v2f16
-; GFX8: call <2 x half> @llvm.minimumnum.v2f16
-; GFX9: call <2 x half> @llvm.minimumnum.v2f16
 define void @minimumnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @minimumnum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -382,10 +667,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @maximumnum_combine_v2f16
-; GFX8: call <2 x half> @llvm.maximumnum.v2f16
-; GFX9: call <2 x half> @llvm.maximumnum.v2f16
 define void @maximumnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @maximumnum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64