diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
index 668946f9eda44d..d6d5275cc5fc98 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
@@ -1,11 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 define <vscale x 16 x i8> @dup_insertelement_0(<vscale x 16 x i8> %v, i8 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_0(
-; CHECK: %insert = insertelement <vscale x 16 x i8> %v, i8 %s, i64 0
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT:    [[INSERT:%.*]] = insertelement <vscale x 16 x i8> [[V:%.*]], i8 [[S:%.*]], i64 0
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[INSERT]]
+;
   %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
@@ -13,9 +15,10 @@ define <vscale x 16 x i8> @dup_insertelement_0(<vscale x 16 x i8> %v, i8 %s) #0
 
 define <vscale x 16 x i8> @dup_insertelement_1(<vscale x 16 x i8> %v, i8 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_1(
-; CHECK: %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
-; CHECK-NEXT: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT:    [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG]], i8 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[INSERT]]
+;
   %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
@@ -23,16 +26,18 @@ define <vscale x 16 x i8> @dup_insertelement_1(<vscale x 16 x i8> %v, i8 %s) #0
 
 define <vscale x 16 x i8> @dup_insertelement_x(<vscale x 16 x i8> %v, i8 %s, <vscale x 16 x i1> %pg) #0 {
 ; CHECK-LABEL: @dup_insertelement_x(
-; CHECK: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT:    [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG:%.*]], i8 [[S:%.*]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[INSERT]]
+;
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
 }
 
 define <vscale x 8 x i16> @dup_insertelement_0_convert(<vscale x 8 x i16> %v, i16 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_0_convert(
-; CHECK: %insert = insertelement <vscale x 8 x i16> %v, i16 %s, i64 0
-; CHECK-NEXT: ret <vscale x 8 x i16> %insert
+; CHECK-NEXT:    [[INSERT:%.*]] = insertelement <vscale x 8 x i16> [[V:%.*]], i16 [[S:%.*]], i64 0
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[INSERT]]
+;
   %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg)
   %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
@@ -42,9 +47,10 @@ define <vscale x 8 x i16> @dup_insertelement_0_convert(<vscale x 8 x i16> %v, i1
 
 define <vscale x 8 x i16> @dupx_splat_convert(i16 %s) #0 {
 ; CHECK-LABEL: @dupx_splat_convert(
-; CHECK-NEXT: %.splatinsert = insertelement <vscale x 8 x i16> poison, i16 %s, i32 0
-; CHECK-NEXT: %splat = shufflevector <vscale x 8 x i16> %.splatinsert, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: ret <vscale x 8 x i16> %splat
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[S:%.*]], i32 0
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[SPLAT]]
+;
   %splat = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 %s)
   ret <vscale x 8 x i16> %splat
 }
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
index e46879fe645e88..1be1a680ff5331 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -5,11 +6,12 @@ target triple = "aarch64-unknown-linux-gnu"
 ; op2 = tbl(op1 dup_x(idx)) -> op2 = vector_splat(extractelement(op1, idx))
 
 define <vscale x 16 x i8> @dup_ext_i8(<vscale x 16 x i8> %data) #0 {
+;
 ; CHECK-LABEL: @dup_ext_i8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 16 x i8> [[DATA:%.*]], i8 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP1]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT:    ret <vscale x 16 x i8> [[DOTSPLAT]]
+; CHECK-NEXT:    [[OUT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[OUT]]
 ;
   %tmp = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 1)
   %out = call <vscale x 16 x i8> @llvm.aarch64.sve.tbl.nxv16i8(<vscale x 16 x i8>  %data, <vscale x 16 x i8> %tmp)
@@ -17,11 +19,12 @@ define <vscale x 16 x i8> @dup_ext_i8(<vscale x 16 x i8> %data) #0 {
 }
 
 define <vscale x 8 x i16> @dup_ext_i16(<vscale x 8 x i16> %data) #0 {
+;
 ; CHECK-LABEL: @dup_ext_i16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 8 x i16> [[DATA:%.*]], i16 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP1]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[DOTSPLAT]]
+; CHECK-NEXT:    [[OUT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[OUT]]
 ;
   %tmp = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
   %out = call <vscale x 8 x i16> @llvm.aarch64.sve.tbl.nxv8i16(<vscale x 8 x i16>  %data, <vscale x 8 x i16> %tmp)
@@ -29,11 +32,12 @@ define <vscale x 8 x i16> @dup_ext_i16(<vscale x 8 x i16> %data) #0 {
 }
 
 define <vscale x 4 x i32> @dup_ext_i32(<vscale x 4 x i32> %data) #0 {
+;
 ; CHECK-LABEL: @dup_ext_i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[DATA:%.*]], i32 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[DOTSPLAT]]
+; CHECK-NEXT:    [[OUT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[OUT]]
 ;
   %tmp = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
   %out = call <vscale x 4 x i32> @llvm.aarch64.sve.tbl.nxv4i32(<vscale x 4 x i32>  %data, <vscale x 4 x i32> %tmp)
@@ -41,11 +45,12 @@ define <vscale x 4 x i32> @dup_ext_i32(<vscale x 4 x i32> %data) #0 {
 }
 
 define <vscale x 2 x i64> @dup_ext_i64(<vscale x 2 x i64> %data) #0 {
+;
 ; CHECK-LABEL: @dup_ext_i64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 2 x i64> [[DATA:%.*]], i64 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP1]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[DOTSPLAT]]
+; CHECK-NEXT:    [[OUT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[OUT]]
 ;
   %tmp = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
   %out = call <vscale x 2 x i64> @llvm.aarch64.sve.tbl.nxv2i64(<vscale x 2 x i64>  %data, <vscale x 2 x i64> %tmp)
@@ -53,11 +58,12 @@ define <vscale x 2 x i64> @dup_ext_i64(<vscale x 2 x i64> %data) #0 {
 }
 
 define <vscale x 8 x half> @dup_ext_f16(<vscale x 8 x half> %data) #0 {
+;
 ; CHECK-LABEL: @dup_ext_f16(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 8 x half> [[DATA:%.*]], i16 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x half> poison, half [[TMP1]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x half> [[DOTSPLATINSERT]], <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    ret <vscale x 8 x half> [[DOTSPLAT]]
+; CHECK-NEXT:    [[OUT:%.*]] = shufflevector <vscale x 8 x half> [[DOTSPLATINSERT]], <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 8 x half> [[OUT]]
 ;
   %tmp = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
   %out = call <vscale x 8 x half> @llvm.aarch64.sve.tbl.nxv8f16(<vscale x 8 x half>  %data, <vscale x 8 x i16> %tmp)
@@ -65,11 +71,12 @@ define <vscale x 8 x half> @dup_ext_f16(<vscale x 8 x half> %data) #0 {
 }
 
 define <vscale x 4 x float> @dup_ext_f32(<vscale x 4 x float> %data) #0 {
+;
 ; CHECK-LABEL: @dup_ext_f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 4 x float> [[DATA:%.*]], i32 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x float> [[DOTSPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    ret <vscale x 4 x float> [[DOTSPLAT]]
+; CHECK-NEXT:    [[OUT:%.*]] = shufflevector <vscale x 4 x float> [[DOTSPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OUT]]
 ;
   %tmp = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
   %out = call <vscale x 4 x float> @llvm.aarch64.sve.tbl.nxv4f32(<vscale x 4 x float>  %data, <vscale x 4 x i32> %tmp)
@@ -77,11 +84,12 @@ define <vscale x 4 x float> @dup_ext_f32(<vscale x 4 x float> %data) #0 {
 }
 
 define <vscale x 2 x double> @dup_ext_f64(<vscale x 2 x double> %data) #0 {
+;
 ; CHECK-LABEL: @dup_ext_f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 2 x double> [[DATA:%.*]], i64 1
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP1]], i32 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x double> [[DOTSPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:    ret <vscale x 2 x double> [[DOTSPLAT]]
+; CHECK-NEXT:    [[OUT:%.*]] = shufflevector <vscale x 2 x double> [[DOTSPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 2 x double> [[OUT]]
 ;
   %tmp = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
   %out = call <vscale x 2 x double> @llvm.aarch64.sve.tbl.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i64> %tmp)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
index e2a8b27d21d71d..b54222cfecddcf 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
@@ -1,165 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s
 
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.buffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @buffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   ret float %data
 }
 
-; CHECK-LABEL: @buffer_load_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt0 = extractelement <2 x float> %data, i32 0
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: %ins0 = insertvalue { float, float } undef, float %elt0, 0
-; CHECK-NEXT: %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
-; CHECK-NEXT: ret { float, float } %ins1
 define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i32 0
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0
+; CHECK-NEXT:    [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1
+; CHECK-NEXT:    ret { float, float } [[INS1]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   %elt1 = extractelement <4 x float> %data, i32 1
@@ -168,16 +186,17 @@ define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i3
   ret { float, float } %ins1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt0 = extractelement <3 x float> %data, i32 0
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 1
-; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: %ins0 = insertvalue { float, float, float } undef, float %elt0, 0
-; CHECK-NEXT: %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1
-; CHECK-NEXT: %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2
-; CHECK-NEXT: ret { float, float, float } %ins2
 define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i32 0
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 1
+; CHECK-NEXT:    [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0
+; CHECK-NEXT:    [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1
+; CHECK-NEXT:    [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2
+; CHECK-NEXT:    ret { float, float, float } [[INS2]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   %elt1 = extractelement <4 x float> %data, i32 1
@@ -188,12 +207,14 @@ define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f
   ret { float, float, float } %ins2
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 undef, i32 1>
-; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf
 define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 undef, i32 1>
+; CHECK-NEXT:    [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
+; CHECK-NEXT:    ret <2 x float> [[RET]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   %elt2 = extractelement <4 x float> %data, i32 2
@@ -204,13 +225,14 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i3
   ret <2 x float> %ret
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf
-; CHECK-NEXT: ret <2 x float> %ret
 define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
+; CHECK-NEXT:    ret <2 x float> [[RET]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   %elt2 = extractelement <4 x float> %data, i32 2
@@ -221,12 +243,14 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i3
   ret <2 x float> %ret
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 2, i32 2>
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf
 define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> <i32 2, i32 2>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
+; CHECK-NEXT:    ret <2 x float> [[RET]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt2 = extractelement <4 x float> %data, i32 2
   %ins0 = insertelement <2 x float> poison, float %elt2, i32 0
@@ -236,58 +260,64 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i3
   ret <2 x float> %ret
 }
 
-; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -303,76 +333,83 @@ declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1)
 ; llvm.amdgcn.buffer.load.format
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @buffer_load_format_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_format_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
 ; The initial insertion point is at the extractelement
-; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
-; CHECK-NEXT: %1 = shufflevector <2 x float> %tmp, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <2 x double>
-; CHECK-NEXT: %tmp2 = extractelement <2 x double> %tmp1, i32 0
-; CHECK-NEXT: ret double %tmp2
 define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
-  %tmp1 = bitcast <4 x float> %tmp to <2 x double>
-  %tmp2 = extractelement <2 x double> %tmp1, i32 0
-  ret double %tmp2
+; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i32 0
+; CHECK-NEXT:    ret double [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %var1 = bitcast <4 x float> %var to <2 x double>
+  %var2 = extractelement <2 x double> %var1, i32 0
+  ret double %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
-; CHECK-NEXT: %1 = bitcast float %tmp to i32
-; CHECK-NEXT: %tmp2 = trunc i32 %1 to i16
-; CHECK-NEXT: ret i16 %tmp2
 define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
-  %tmp1 = bitcast <4 x float> %tmp to <8 x i16>
-  %tmp2 = extractelement <8 x i16> %tmp1, i32 0
-  ret i16 %tmp2
+; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %var1 = bitcast <4 x float> %var to <8 x i16>
+  %var2 = extractelement <8 x i16> %var1, i32 0
+  ret i16 %var2
 }
 
 declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1
@@ -385,228 +422,252 @@ declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i
 ; llvm.amdgcn.raw.buffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @raw_buffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @raw_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 12
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32(
-; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
-; CHECK-NEXT: ret float %tmp2
 define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
-  %tmp2 = extractelement <4 x float> %tmp1, i32 0
-  ret float %tmp2
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32(
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast i32 [[VAR]] to float
+; CHECK-NEXT:    ret float [[VAR2]]
+;
+  %var = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x i32> %var to <4 x float>
+  %var2 = extractelement <4 x float> %var1, i32 0
+  ret float %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -620,59 +681,65 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32)
 
 declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16(
-; CHECK: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x half> %data, i32 0
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x half> %data, i32 1
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 6
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 3
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x half>
 define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x half> [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x half> %shuf
@@ -683,59 +750,65 @@ declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32)
 declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) #1
 declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8(
-; CHECK: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x i8> %data, i32 0
   ret i8 %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x i8> %data, i32 1
   ret i8 %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 3
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 3
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8(
-; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x i8>
 define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x i8> [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %shuf
@@ -750,179 +823,198 @@ declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) #1
 ; llvm.amdgcn.s.buffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @s_buffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @s_buffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @s_buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @s_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 12
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
@@ -930,42 +1022,46 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> in
 
 ; Do not trim to vec3 s_buffer_load in instcombine, as the load will most likely be widened
 ; to vec4 anyway during lowering.
-; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32(
-; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
-; CHECK-NEXT: ret float %tmp2
 define float @extract0_bitcast_s_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
-  %tmp = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
-  %tmp2 = extractelement <4 x float> %tmp1, i32 0
-  ret float %tmp2
+; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32(
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast i32 [[VAR]] to float
+; CHECK-NEXT:    ret float [[VAR2]]
+;
+  %var = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
+  %var1 = bitcast <4 x i32> %var to <4 x float>
+  %var2 = extractelement <4 x float> %var1, i32 0
+  ret float %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -977,60 +1073,66 @@ declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) #1
 declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16(
-; CHECK: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <2 x half> %data, i32 0
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <2 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <3 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 1
   ret half %elt1
 }
 
 
-; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 6
-; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 3
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x half>
 define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x half> [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x half> %shuf
@@ -1041,59 +1143,65 @@ declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1
 declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1
 declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8(
-; CHECK: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <2 x i8> %data, i32 0
   ret i8 %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <2 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <3 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 3
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 3
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8(
-; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x i8>
 define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x i8> [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %shuf
@@ -1108,226 +1216,250 @@ declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1
 ; llvm.amdgcn.raw.buffer.load.format
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @raw_buffer_load_format_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @raw_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_format_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_format_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @raw_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_format_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %tmp
 define float @extract0_bitcast_raw_buffer_load_format_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = extractelement <4 x float> %tmp, i32 0
-  ret float %tmp1
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[VAR]]
+;
+  %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = extractelement <4 x float> %var, i32 0
+  ret float %var1
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -1343,228 +1475,252 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i3
 ; llvm.amdgcn.struct.buffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @struct_buffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @struct_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @struct_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 12
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32(
-; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
-; CHECK-NEXT: ret float %tmp2
 define float @extract0_bitcast_struct_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
-  %tmp2 = extractelement <4 x float> %tmp1, i32 0
-  ret float %tmp2
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32(
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast i32 [[VAR]] to float
+; CHECK-NEXT:    ret float [[VAR2]]
+;
+  %var = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x i32> %var to <4 x float>
+  %var2 = extractelement <4 x float> %var1, i32 0
+  ret float %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -1578,59 +1734,65 @@ declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i
 
 declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16(
-; CHECK: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x half> %data, i32 0
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 6
-; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 3
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x half>
 define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x half> [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x half> %shuf
@@ -1641,59 +1803,65 @@ declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i3
 declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1
 declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8(
-; CHECK: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x i8> %data, i32 0
   ret i8 %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 3
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 3
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8(
-; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x i8>
 define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x i8> [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %shuf
@@ -1708,217 +1876,240 @@ declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32,
 ; llvm.amdgcn.struct.buffer.load.format
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @struct_buffer_load_format_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @struct_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_format_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_format_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @struct_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_format_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -1936,209 +2127,231 @@ declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32>, i32, i
 ; llvm.amdgcn.raw.tbuffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @raw_tbuffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @raw_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @raw_tbuffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @raw_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @raw_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -2151,40 +2364,44 @@ declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32
 
 declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16(
-; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 3
-; CHECK-NEXT: ret half %elt1
 define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 3
+; CHECK-NEXT:    ret half [[ELT1]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 3
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16(
-; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x half> %data, i32 2
-; CHECK-NEXT: ret half %elt1
 define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x half> [[DATA]], i32 2
+; CHECK-NEXT:    ret half [[ELT1]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 2
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x half> %data, i32 1
-; CHECK-NEXT: ret half %elt1
 define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x half> [[DATA]], i32 1
+; CHECK-NEXT:    ret half [[ELT1]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 0
   ret half %elt1
@@ -2199,209 +2416,231 @@ declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32,
 ; llvm.amdgcn.struct.tbuffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @struct_tbuffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @struct_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @struct_tbuffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @struct_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @struct_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -2418,209 +2657,231 @@ declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i3
 ; llvm.amdgcn.tbuffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @tbuffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @tbuffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   ret float %data
 }
 
-; CHECK-LABEL: @tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -2638,20 +2899,24 @@ declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32,
 ; llvm.amdgcn.image.sample
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
 ; Check that the intrinsic remains unchanged in the presence of TFE or LWE
-; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe(
-; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
-; CHECK: ret float %elt0
 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe(
+; CHECK-NEXT:    [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 1, i32 0)
+; CHECK-NEXT:    [[DATA_VEC:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA_VEC]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
   %data.vec = extractvalue {<4 x float>,i32} %data, 0
   %elt0 = extractelement <4 x float> %data.vec, i32 0
@@ -2659,176 +2924,197 @@ define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr,
 }
 
 ; Check that the intrinsic remains unchanged in the presence of TFE or LWE
-; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe(
-; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
-; CHECK: ret float %elt0
 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe(
+; CHECK-NEXT:    [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 2, i32 0)
+; CHECK-NEXT:    [[DATA_VEC:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA_VEC]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
   %data.vec = extractvalue {<4 x float>,i32} %data, 0
   %elt0 = extractelement <4 x float> %data.vec, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(
-; CHECK-NEXT: ret float undef
 define amdgpu_ps float @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(float %s, float %t, float %r, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(
+; CHECK-NEXT:    ret float undef
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 0, float %s, float %t, float %r, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1darray.f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(float %s, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1darray.f32.f32(i32 1, float [[S:%.*]], float [[SLICE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 2, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 4, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 8, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 9, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %1 = insertelement <2 x float> undef, float %data, i32 0
-; CHECK-NEXT: ret <2 x float> %1
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[DATA]], i32 0
+; CHECK-NEXT:    ret <2 x float> [[TMP1]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %1 = insertelement <3 x float> undef, float %data, i32 0
-; CHECK-NEXT: ret <3 x float> %1
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> undef, float [[DATA]], i32 0
+; CHECK-NEXT:    ret <3 x float> [[TMP1]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x float> [[DATA]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x float> [[DATA]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
@@ -2844,10 +3130,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, floa
 ; llvm.amdgcn.image.sample.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_image_sample_cl_2darray_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.2darray.f32.f32(i32 2, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_image_sample_cl_2darray_v4f32_f32(float %s, float %t, float %slice, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_image_sample_cl_2darray_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cl.2darray.f32.f32(i32 2, float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 1
   ret float %elt0
@@ -2859,10 +3146,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32, float, f
 ; llvm.amdgcn.image.sample.d
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt2_image_sample_d_cube_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32 4, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_image_sample_d_cube_v4f32_f32_f32(float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt2_image_sample_d_cube_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32 4, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[FACE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 2
   ret float %elt0
@@ -2874,10 +3162,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32, float, f
 ; llvm.amdgcn.image.sample.d.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.1d.f32.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cl.1d.f32.f32.f32(i32 8, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 3
   ret float %elt0
@@ -2889,10 +3178,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32, float,
 ; llvm.amdgcn.image.sample.l
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32 4, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32 4, float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32 6, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 1
   ret float %elt0
@@ -2904,10 +3194,11 @@ declare <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32, float, float,
 ; llvm.amdgcn.image.sample.b
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.1d.f32.f32.f32(i32 8, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.1d.f32.f32.f32(i32 8, float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 9, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 1
   ret float %elt0
@@ -2919,10 +3210,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, flo
 ; llvm.amdgcn.image.sample.b.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.b.cl.1d.v2f32.f32.f32(i32 12, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.b.cl.1d.v2f32.f32.f32(i32 12, float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 13, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
@@ -2934,10 +3226,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float,
 ; llvm.amdgcn.image.sample.lz
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.lz.1d.v2f32.f32(i32 10, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.1d.v2f32.f32(i32 10, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 3>
   ret <2 x float> %shuf
@@ -2949,10 +3242,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i
 ; llvm.amdgcn.image.sample.cd
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.cd.1d.v3f32.f32.f32(i32 14, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.cd.1d.v3f32.f32.f32(i32 14, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
@@ -2964,67 +3258,74 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, fl
 ; llvm.amdgcn.image.sample.cd.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x half> %data, i32 3
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x half> %data, i32 2
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x half> %data, i32 1
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %res = shufflevector <3 x half> %data, <3 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-; CHECK-NEXT: ret <4 x half> %res
 define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <3 x half> [[DATA]], <3 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    ret <4 x half> [[RES]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x half> %res
 }
 
-; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: ret <4 x half> %res
 define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x half> [[DATA]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x half> [[RES]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x half> %res
 }
 
-; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %res = insertelement <4 x half> poison, half %data, i64 0
-; CHECK-NEXT: ret <4 x half> %res
 define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[RES:%.*]] = insertelement <4 x half> poison, half [[DATA]], i64 0
+; CHECK-NEXT:    ret <4 x half> [[RES]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
   ret <4 x half> %res
 }
 
-; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x half> %data, i32 0
   ret half %elt0
@@ -3036,10 +3337,11 @@ declare <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32, float,
 ; llvm.amdgcn.image.sample.c
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3051,10 +3353,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float,
 ; llvm.amdgcn.image.sample.c.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cl_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.1d.f32.f32(i32 1, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cl_1d_v4f32_f32(float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cl_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cl.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3066,10 +3369,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32, float, floa
 ; llvm.amdgcn.image.sample.c.d
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3081,10 +3385,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32, float, f
 ; llvm.amdgcn.image.sample.c.d.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.cl.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3096,10 +3401,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float
 ; llvm.amdgcn.image.sample.c.l
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_l_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.1d.f32.f32(i32 1, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_l_1d_v4f32_f32(float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_l_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.l.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3111,10 +3417,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float
 ; llvm.amdgcn.image.sample.c.b
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.1d.f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3126,10 +3433,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32, float, f
 ; llvm.amdgcn.image.sample.c.b.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.cl.1d.f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3141,10 +3449,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32, float
 ; llvm.amdgcn.image.sample.c.lz
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_lz_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_lz_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_lz_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.lz.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3156,10 +3465,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, floa
 ; llvm.amdgcn.image.sample.c.cd
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3171,10 +3481,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float,
 ; llvm.amdgcn.image.sample.c.cd.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.cl.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3186,10 +3497,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, floa
 ; llvm.amdgcn.image.sample.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3201,10 +3513,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32, i32, float, <8
 ; llvm.amdgcn.image.sample.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_cl_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_cl_o_1d_v4f32_f32(i32 %offset, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_cl_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cl.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3216,10 +3529,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.sample.d.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3231,10 +3545,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32, i32, flo
 ; llvm.amdgcn.image.sample.d.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3246,10 +3561,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.sample.l.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_l_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_l_o_1d_v4f32_f32(i32 %offset, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_l_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.l.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3261,10 +3577,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.sample.b.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3276,10 +3593,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32, i32, flo
 ; llvm.amdgcn.image.sample.b.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3291,10 +3609,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.sample.lz.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_lz_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.lz.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_lz_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_lz_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.lz.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3306,10 +3625,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.sample.cd.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cd.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3321,10 +3641,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32, i32, fl
 ; llvm.amdgcn.image.sample.cd.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cd.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3336,10 +3657,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.sample.c.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3351,10 +3673,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.sample.c.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cl.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3366,10 +3689,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32, i32, floa
 ; llvm.amdgcn.image.sample.c.d.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3381,10 +3705,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32, i32, f
 ; llvm.amdgcn.image.sample.c.d.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3396,10 +3721,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32, i32
 ; llvm.amdgcn.image.sample.c.l.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.l.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3411,10 +3737,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32, i32, float
 ; llvm.amdgcn.image.sample.c.b.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3426,10 +3753,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32, i32, f
 ; llvm.amdgcn.image.sample.c.b.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3441,10 +3769,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32, i32
 ; llvm.amdgcn.image.sample.c.lz.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.lz.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3456,10 +3785,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32, i32, floa
 ; llvm.amdgcn.image.sample.c.cd.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3471,10 +3801,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.sample.c.cd.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3488,9 +3819,12 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32, i3
 
 ; Don't handle gather4*
 
-; CHECK-LABEL: @extract_elt0_image_gather4_2d_v4f32_f32(
-; CHECK: %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3502,9 +3836,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <
 ; llvm.amdgcn.image.gather4.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_cl_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_cl_2d_v4f32_f32(float %s, float %t, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_cl_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3516,9 +3853,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32, float, float
 ; llvm.amdgcn.image.gather4.l
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_l_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_l_2d_v4f32_f32(float %s, float %t, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_l_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3530,9 +3870,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32, float, float,
 ; llvm.amdgcn.image.gather4.b
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(float %bias, float %s, float %t, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3544,9 +3887,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32, floa
 ; llvm.amdgcn.image.gather4.b.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[FACE:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3558,9 +3904,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32, floa
 ; llvm.amdgcn.image.gather4.lz
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_lz_2d_v4f32_f16(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_lz_2d_v4f32_f16(half %s, half %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_lz_2d_v4f32_f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3572,9 +3921,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32, half, half,
 ; llvm.amdgcn.image.gather4.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3586,9 +3938,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32, i32, float, f
 ; llvm.amdgcn.image.gather4.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3600,9 +3955,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32, i32, float
 ; llvm.amdgcn.image.gather4.l.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_l_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_l_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %lod, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_l_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3614,9 +3972,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.gather4.b.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3628,9 +3989,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32, i32, fl
 ; llvm.amdgcn.image.gather4.b.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3642,9 +4006,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.gather4.lz.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3656,9 +4023,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32, i32, float
 ; llvm.amdgcn.image.gather4.c.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3670,9 +4040,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.gather4.c.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3684,9 +4057,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32, i32, flo
 ; llvm.amdgcn.image.gather4.c.l.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3698,9 +4074,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32, i32, floa
 ; llvm.amdgcn.image.gather4.c.b.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3712,9 +4091,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.gather4.c.b.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3726,9 +4108,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32, i3
 ; llvm.amdgcn.image.gather4.c.lz.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3740,10 +4125,11 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32, i32, flo
 ; llvm.amdgcn.image.getlod
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_getlod_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getlod.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_getlod_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_getlod_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.getlod.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3755,10 +4141,11 @@ declare <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32, float, <8 x i32>
 ; llvm.amdgcn.image.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_load_2dmsaa_v4f32_i32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_load_2dmsaa_v4f32_i32(i32 %s, i32 %t, i32 %sample, <8 x i32> inreg %sampler) #0 {
+; CHECK-LABEL: @extract_elt0_image_load_2dmsaa_v4f32_i32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SAMPLE:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3770,10 +4157,11 @@ declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32,
 ; llvm.amdgcn.image.load.mip
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_load_mip_1d_v4f32_i32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32 1, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_load_mip_1d_v4f32_i32(i32 %s, i32 %mip, <8 x i32> inreg %sampler) #0 {
+; CHECK-LABEL: @extract_elt0_image_load_mip_1d_v4f32_i32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32 1, i32 [[S:%.*]], i32 [[MIP:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3785,10 +4173,11 @@ declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x
 ; llvm.amdgcn.image.getresinfo
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_getresinfo_1d_v4f32_i32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_getresinfo_1d_v4f32_i32(i32 %mip, <8 x i32> inreg %sampler) #0 {
+; CHECK-LABEL: @extract_elt0_image_getresinfo_1d_v4f32_i32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 [[MIP:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3800,9 +4189,13 @@ declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32, i32, <8 x i3
 ; TFE / LWE
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32(
-; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
 define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32(
+; CHECK-NEXT:    [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 1)
+; CHECK-NEXT:    [[RGBA:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[RGBA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
   %rgba = extractvalue { <4 x float>, i32 } %data, 0
   %elt0 = extractelement <4 x float> %rgba, i32 0
@@ -3811,10 +4204,11 @@ define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x
 
 declare {<4 x float>, i32} @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32, i32, <8 x i32>, i32, i32) #1
 
-; CHECK: @tfe_check_assert(
-; CHECK: %data = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1)
-; CHECK-NEXT: ret float %data
 define amdgpu_hs float @tfe_check_assert() #0 {
+; CHECK-LABEL: @tfe_check_assert(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call nsz <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1) #2
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
index 855d396ed63d69..999b574c0affc7 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -1,165 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s
 
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.buffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @buffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   ret float %data
 }
 
-; CHECK-LABEL: @buffer_load_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt0 = extractelement <2 x float> %data, i32 0
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: %ins0 = insertvalue { float, float } undef, float %elt0, 0
-; CHECK-NEXT: %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
-; CHECK-NEXT: ret { float, float } %ins1
 define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i32 0
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0
+; CHECK-NEXT:    [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1
+; CHECK-NEXT:    ret { float, float } [[INS1]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   %elt1 = extractelement <4 x float> %data, i32 1
@@ -168,16 +186,17 @@ define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i3
   ret { float, float } %ins1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt0 = extractelement <3 x float> %data, i32 0
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 1
-; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: %ins0 = insertvalue { float, float, float } undef, float %elt0, 0
-; CHECK-NEXT: %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1
-; CHECK-NEXT: %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2
-; CHECK-NEXT: ret { float, float, float } %ins2
 define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i32 0
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 1
+; CHECK-NEXT:    [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0
+; CHECK-NEXT:    [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1
+; CHECK-NEXT:    [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2
+; CHECK-NEXT:    ret { float, float, float } [[INS2]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   %elt1 = extractelement <4 x float> %data, i32 1
@@ -188,12 +207,14 @@ define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f
   ret { float, float, float } %ins2
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 undef, i32 1>
-; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf
 define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 undef, i32 1>
+; CHECK-NEXT:    [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
+; CHECK-NEXT:    ret <2 x float> [[RET]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   %elt2 = extractelement <4 x float> %data, i32 2
@@ -204,13 +225,14 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i3
   ret <2 x float> %ret
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 undef>
-; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf
-; CHECK-NEXT: ret <2 x float> %ret
 define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
+; CHECK-NEXT:    ret <2 x float> [[RET]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   %elt2 = extractelement <4 x float> %data, i32 2
@@ -221,12 +243,14 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i3
   ret <2 x float> %ret
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 2, i32 2>
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf
 define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> <i32 2, i32 2>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
+; CHECK-NEXT:    ret <2 x float> [[RET]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt2 = extractelement <4 x float> %data, i32 2
   %ins0 = insertelement <2 x float> undef, float %elt2, i32 0
@@ -236,58 +260,64 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i3
   ret <2 x float> %ret
 }
 
-; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -303,76 +333,83 @@ declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1)
 ; llvm.amdgcn.buffer.load.format
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @buffer_load_format_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @buffer_load_format_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
 ; The initial insertion point is at the extractelement
-; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
-; CHECK-NEXT: %1 = shufflevector <2 x float> %tmp, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <2 x double>
-; CHECK-NEXT: %tmp2 = extractelement <2 x double> %tmp1, i32 0
-; CHECK-NEXT: ret double %tmp2
 define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
-  %tmp1 = bitcast <4 x float> %tmp to <2 x double>
-  %tmp2 = extractelement <2 x double> %tmp1, i32 0
-  ret double %tmp2
+; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i32 0
+; CHECK-NEXT:    ret double [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %var1 = bitcast <4 x float> %var to <2 x double>
+  %var2 = extractelement <2 x double> %var1, i32 0
+  ret double %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
-; CHECK-NEXT: %1 = bitcast float %tmp to i32
-; CHECK-NEXT: %tmp2 = trunc i32 %1 to i16
-; CHECK-NEXT: ret i16 %tmp2
 define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
-  %tmp1 = bitcast <4 x float> %tmp to <8 x i16>
-  %tmp2 = extractelement <8 x i16> %tmp1, i32 0
-  ret i16 %tmp2
+; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %var1 = bitcast <4 x float> %var to <8 x i16>
+  %var2 = extractelement <8 x i16> %var1, i32 0
+  ret i16 %var2
 }
 
 declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1
@@ -385,228 +422,252 @@ declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i
 ; llvm.amdgcn.raw.buffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @raw_buffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @raw_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 12
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32(
-; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
-; CHECK-NEXT: ret float %tmp2
 define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
-  %tmp2 = extractelement <4 x float> %tmp1, i32 0
-  ret float %tmp2
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32(
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast i32 [[VAR]] to float
+; CHECK-NEXT:    ret float [[VAR2]]
+;
+  %var = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x i32> %var to <4 x float>
+  %var2 = extractelement <4 x float> %var1, i32 0
+  ret float %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -620,59 +681,65 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32)
 
 declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16(
-; CHECK: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x half> %data, i32 0
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x half> %data, i32 1
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 6
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 3
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x half>
 define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x half> [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x half> %shuf
@@ -683,59 +750,65 @@ declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32)
 declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) #1
 declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8(
-; CHECK: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x i8> %data, i32 0
   ret i8 %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x i8> %data, i32 1
   ret i8 %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 3
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 3
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8(
-; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x i8>
 define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x i8> [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %shuf
@@ -750,179 +823,198 @@ declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) #1
 ; llvm.amdgcn.s.buffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @s_buffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @s_buffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @s_buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @s_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 12
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
@@ -930,42 +1022,46 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> in
 
 ; Do not trim to vec3 s_buffer_load in instcombine, as the load will most likely be widened
 ; to vec4 anyway during lowering.
-; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32(
-; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
-; CHECK-NEXT: ret float %tmp2
 define float @extract0_bitcast_s_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
-  %tmp = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
-  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
-  %tmp2 = extractelement <4 x float> %tmp1, i32 0
-  ret float %tmp2
+; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32(
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast i32 [[VAR]] to float
+; CHECK-NEXT:    ret float [[VAR2]]
+;
+  %var = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
+  %var1 = bitcast <4 x i32> %var to <4 x float>
+  %var2 = extractelement <4 x float> %var1, i32 0
+  ret float %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -977,60 +1073,66 @@ declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) #1
 declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16(
-; CHECK: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <2 x half> %data, i32 0
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <2 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <3 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 1
   ret half %elt1
 }
 
 
-; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 6
-; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 3
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x half>
 define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x half> [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x half> %shuf
@@ -1041,59 +1143,65 @@ declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1
 declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1
 declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8(
-; CHECK: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt0 = extractelement <2 x i8> %data, i32 0
   ret i8 %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <2 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <3 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 3
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 3
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8(
-; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
-; CHECK-NEXT: ret <2 x i8>
 define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x i8> [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
   %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %shuf
@@ -1108,226 +1216,250 @@ declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1
 ; llvm.amdgcn.raw.buffer.load.format
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @raw_buffer_load_format_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @raw_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_format_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_format_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @raw_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_format_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %tmp
 define float @extract0_bitcast_raw_buffer_load_format_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = extractelement <4 x float> %tmp, i32 0
-  ret float %tmp1
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[VAR]]
+;
+  %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = extractelement <4 x float> %var, i32 0
+  ret float %var1
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -1343,228 +1475,252 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i3
 ; llvm.amdgcn.struct.buffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @struct_buffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @struct_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @struct_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 12
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 8
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32(
-; CHECK-NEXT: %1 = add i32 %ofs, 4
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32(
-; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
-; CHECK-NEXT: ret float %tmp2
 define float @extract0_bitcast_struct_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
-  %tmp2 = extractelement <4 x float> %tmp1, i32 0
-  ret float %tmp2
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32(
+; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast i32 [[VAR]] to float
+; CHECK-NEXT:    ret float [[VAR2]]
+;
+  %var = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x i32> %var to <4 x float>
+  %var2 = extractelement <4 x float> %var1, i32 0
+  ret float %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -1578,59 +1734,65 @@ declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i
 
 declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16(
-; CHECK: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x half> %data, i32 0
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 2
-; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16(
-; CHECK-NEXT: %1 = add i32 %ofs, 6
-; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 3
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x half>
 define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x half> [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x half> %shuf
@@ -1641,59 +1803,65 @@ declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i3
 declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1
 declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8(
-; CHECK: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x i8> %data, i32 0
   ret i8 %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 1
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 1
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8(
-; CHECK-NEXT: %1 = add i32 %ofs, 3
-; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
-; CHECK-NEXT: ret i8 %data
 define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3
+; CHECK-NEXT:    [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret i8 [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x i8> %data, i32 3
   ret i8 %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8(
-; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x i8>
 define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x i8> [[DATA]]
+;
   %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x i8> %shuf
@@ -1708,217 +1876,240 @@ declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32,
 ; llvm.amdgcn.struct.buffer.load.format
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @struct_buffer_load_format_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @struct_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_format_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_format_v1f32(
-; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <1 x float> %data
 define amdgpu_ps <1 x float> @struct_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_format_v1f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <1 x float> [[DATA]]
+;
   %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <1 x float> %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -1936,209 +2127,231 @@ declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32>, i32, i
 ; llvm.amdgcn.raw.tbuffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @raw_tbuffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @raw_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @raw_tbuffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @raw_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @raw_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -2151,40 +2364,44 @@ declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32
 
 declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1
 
-; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16(
-; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 3
-; CHECK-NEXT: ret half %elt1
 define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 3
+; CHECK-NEXT:    ret half [[ELT1]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 3
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16(
-; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x half> %data, i32 2
-; CHECK-NEXT: ret half %elt1
 define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x half> [[DATA]], i32 2
+; CHECK-NEXT:    ret half [[ELT1]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 2
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x half> %data, i32 1
-; CHECK-NEXT: ret half %elt1
 define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x half> [[DATA]], i32 1
+; CHECK-NEXT:    ret half [[ELT1]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 1
   ret half %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
   %elt1 = extractelement <4 x half> %data, i32 0
   ret half %elt1
@@ -2199,209 +2416,231 @@ declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32,
 ; llvm.amdgcn.struct.tbuffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @struct_tbuffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @struct_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @struct_tbuffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   ret float %data
 }
 
-; CHECK-LABEL: @struct_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @struct_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -2418,209 +2657,231 @@ declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i3
 ; llvm.amdgcn.tbuffer.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @tbuffer_load_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @tbuffer_load_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   ret float %data
 }
 
-; CHECK-LABEL: @tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   ret <2 x float> %data
 }
 
-; CHECK-LABEL: @tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> %data
 define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <4 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   ret <4 x float> %data
 }
 
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32(
-; CHECK: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <2 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <4 x float> %data, i32 3
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt0 = extractelement <3 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <3 x float> %data, i32 1
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
-; CHECK-NEXT: ret float %elt1
 define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2
+; CHECK-NEXT:    ret float [[ELT1]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %elt1 = extractelement <3 x float> %data, i32 2
   ret float %elt1
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float>
 define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> %shuf
 define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    ret <2 x float> [[SHUF]]
+;
   %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
   %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32(
-; CHECK-NEXT: %tmp = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
-; CHECK-NEXT: ret i32 %tmp2
 define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-  %tmp = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
-  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
-  ret i32 %tmp2
+; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32(
+; CHECK-NEXT:    [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+; CHECK-NEXT:    [[VAR2:%.*]] = bitcast float [[VAR]] to i32
+; CHECK-NEXT:    ret i32 [[VAR2]]
+;
+  %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
+  %var1 = bitcast <4 x float> %var to <4 x i32>
+  %var2 = extractelement <4 x i32> %var1, i32 0
+  ret i32 %var2
 }
 
-; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
+; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
   %elt0 = extractelement <2 x float> %data, i32 0
   ret float %elt0
@@ -2638,20 +2899,24 @@ declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32,
 ; llvm.amdgcn.image.sample
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
 ; Check that the intrinsic remains unchanged in the presence of TFE or LWE
-; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe(
-; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
-; CHECK: ret float %elt0
 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe(
+; CHECK-NEXT:    [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 1, i32 0)
+; CHECK-NEXT:    [[DATA_VEC:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA_VEC]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
   %data.vec = extractvalue {<4 x float>,i32} %data, 0
   %elt0 = extractelement <4 x float> %data.vec, i32 0
@@ -2659,176 +2924,197 @@ define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr,
 }
 
 ; Check that the intrinsic remains unchanged in the presence of TFE or LWE
-; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe(
-; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
-; CHECK: ret float %elt0
 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe(
+; CHECK-NEXT:    [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 2, i32 0)
+; CHECK-NEXT:    [[DATA_VEC:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA_VEC]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
   %data.vec = extractvalue {<4 x float>,i32} %data, 0
   %elt0 = extractelement <4 x float> %data.vec, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(
-; CHECK-NEXT: ret float undef
 define amdgpu_ps float @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(float %s, float %t, float %r, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(
+; CHECK-NEXT:    ret float undef
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 0, float %s, float %t, float %r, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1darray.f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(float %s, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1darray.f32.f32(i32 1, float [[S:%.*]], float [[SLICE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 2, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 4, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 8, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 9, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %1 = insertelement <2 x float> undef, float %data, i32 0
-; CHECK-NEXT: ret <2 x float> %1
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[DATA]], i32 0
+; CHECK-NEXT:    ret <2 x float> [[TMP1]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
   ret <2 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %1 = insertelement <3 x float> undef, float %data, i32 0
-; CHECK-NEXT: ret <3 x float> %1
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> undef, float [[DATA]], i32 0
+; CHECK-NEXT:    ret <3 x float> [[TMP1]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x float> [[DATA]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
-; CHECK-NEXT: ret <3 x float> %shuf
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x float> [[DATA]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT:    ret <3 x float> [[SHUF]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
 }
 
-; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x float> %shuf
@@ -2844,10 +3130,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, floa
 ; llvm.amdgcn.image.sample.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_image_sample_cl_2darray_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.2darray.f32.f32(i32 2, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_image_sample_cl_2darray_v4f32_f32(float %s, float %t, float %slice, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_image_sample_cl_2darray_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cl.2darray.f32.f32(i32 2, float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 1
   ret float %elt0
@@ -2859,10 +3146,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32, float, f
 ; llvm.amdgcn.image.sample.d
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt2_image_sample_d_cube_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32 4, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt2_image_sample_d_cube_v4f32_f32_f32(float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt2_image_sample_d_cube_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32 4, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[FACE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 2
   ret float %elt0
@@ -2874,10 +3162,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32, float, f
 ; llvm.amdgcn.image.sample.d.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.1d.f32.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cl.1d.f32.f32.f32(i32 8, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 3
   ret float %elt0
@@ -2889,10 +3178,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32, float,
 ; llvm.amdgcn.image.sample.l
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32 4, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32 4, float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32 6, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <2 x float> %data, i32 1
   ret float %elt0
@@ -2904,10 +3194,11 @@ declare <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32, float, float,
 ; llvm.amdgcn.image.sample.b
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.1d.f32.f32.f32(i32 8, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.1d.f32.f32.f32(i32 8, float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 9, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 1
   ret float %elt0
@@ -2919,10 +3210,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, flo
 ; llvm.amdgcn.image.sample.b.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.b.cl.1d.v2f32.f32.f32(i32 12, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.b.cl.1d.v2f32.f32.f32(i32 12, float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 13, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %shuf
@@ -2934,10 +3226,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float,
 ; llvm.amdgcn.image.sample.lz
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(
-; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.lz.1d.v2f32.f32(i32 10, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <2 x float> %data
 define amdgpu_ps <2 x float> @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.1d.v2f32.f32(i32 10, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <2 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 3>
   ret <2 x float> %shuf
@@ -2949,10 +3242,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i
 ; llvm.amdgcn.image.sample.cd
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.cd.1d.v3f32.f32.f32(i32 14, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret <3 x float> %data
 define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.cd.1d.v3f32.f32.f32(i32 14, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret <3 x float> [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
   ret <3 x float> %shuf
@@ -2964,67 +3258,74 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, fl
 ; llvm.amdgcn.image.sample.cd.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x half> %data, i32 3
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x half> %data, i32 2
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x half> %data, i32 1
   ret half %elt0
 }
 
-; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %res = shufflevector <3 x half> %data, <3 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-; CHECK-NEXT: ret <4 x half> %res
 define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <3 x half> [[DATA]], <3 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    ret <4 x half> [[RES]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x half> %res
 }
 
-; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: ret <4 x half> %res
 define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x half> [[DATA]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x half> [[RES]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x half> %res
 }
 
-; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %res = insertelement <4 x half> poison, half %data, i64 0
-; CHECK-NEXT: ret <4 x half> %res
 define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[RES:%.*]] = insertelement <4 x half> poison, half [[DATA]], i64 0
+; CHECK-NEXT:    ret <4 x half> [[RES]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
   ret <4 x half> %res
 }
 
-; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret half %data
 define amdgpu_ps half @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret half [[DATA]]
+;
   %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x half> %data, i32 0
   ret half %elt0
@@ -3036,10 +3337,11 @@ declare <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32, float,
 ; llvm.amdgcn.image.sample.c
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3051,10 +3353,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float,
 ; llvm.amdgcn.image.sample.c.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cl_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.1d.f32.f32(i32 1, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cl_1d_v4f32_f32(float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cl_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cl.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3066,10 +3369,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32, float, floa
 ; llvm.amdgcn.image.sample.c.d
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3081,10 +3385,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32, float, f
 ; llvm.amdgcn.image.sample.c.d.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.cl.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3096,10 +3401,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float
 ; llvm.amdgcn.image.sample.c.l
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_l_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.1d.f32.f32(i32 1, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_l_1d_v4f32_f32(float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_l_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.l.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3111,10 +3417,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float
 ; llvm.amdgcn.image.sample.c.b
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.1d.f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3126,10 +3433,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32, float, f
 ; llvm.amdgcn.image.sample.c.b.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.cl.1d.f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3141,10 +3449,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32, float
 ; llvm.amdgcn.image.sample.c.lz
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_lz_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_lz_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_lz_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.lz.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3156,10 +3465,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, floa
 ; llvm.amdgcn.image.sample.c.cd
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3171,10 +3481,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float,
 ; llvm.amdgcn.image.sample.c.cd.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.cl.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3186,10 +3497,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, floa
 ; llvm.amdgcn.image.sample.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3201,10 +3513,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32, i32, float, <8
 ; llvm.amdgcn.image.sample.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_cl_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_cl_o_1d_v4f32_f32(i32 %offset, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_cl_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cl.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3216,10 +3529,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.sample.d.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3231,10 +3545,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32, i32, flo
 ; llvm.amdgcn.image.sample.d.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3246,10 +3561,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.sample.l.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_l_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_l_o_1d_v4f32_f32(i32 %offset, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_l_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.l.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3261,10 +3577,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.sample.b.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3276,10 +3593,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32, i32, flo
 ; llvm.amdgcn.image.sample.b.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3291,10 +3609,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.sample.lz.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_lz_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.lz.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_lz_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_lz_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.lz.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3306,10 +3625,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.sample.cd.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cd.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3321,10 +3641,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32, i32, fl
 ; llvm.amdgcn.image.sample.cd.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cd.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3336,10 +3657,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.sample.c.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3351,10 +3673,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.sample.c.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cl.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3366,10 +3689,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32, i32, floa
 ; llvm.amdgcn.image.sample.c.d.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3381,10 +3705,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32, i32, f
 ; llvm.amdgcn.image.sample.c.d.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3396,10 +3721,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32, i32
 ; llvm.amdgcn.image.sample.c.l.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.l.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3411,10 +3737,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32, i32, float
 ; llvm.amdgcn.image.sample.c.b.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3426,10 +3753,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32, i32, f
 ; llvm.amdgcn.image.sample.c.b.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3441,10 +3769,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32, i32
 ; llvm.amdgcn.image.sample.c.lz.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.lz.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3456,10 +3785,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32, i32, floa
 ; llvm.amdgcn.image.sample.c.cd.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3471,10 +3801,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.sample.c.cd.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3488,9 +3819,12 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32, i3
 
 ; Don't handle gather4*
 
-; CHECK-LABEL: @extract_elt0_image_gather4_2d_v4f32_f32(
-; CHECK: %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3502,9 +3836,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <
 ; llvm.amdgcn.image.gather4.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_cl_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_cl_2d_v4f32_f32(float %s, float %t, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_cl_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3516,9 +3853,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32, float, float
 ; llvm.amdgcn.image.gather4.l
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_l_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_l_2d_v4f32_f32(float %s, float %t, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_l_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3530,9 +3870,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32, float, float,
 ; llvm.amdgcn.image.gather4.b
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(float %bias, float %s, float %t, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3544,9 +3887,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32, floa
 ; llvm.amdgcn.image.gather4.b.cl
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[FACE:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3558,9 +3904,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32, floa
 ; llvm.amdgcn.image.gather4.lz
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_lz_2d_v4f32_f16(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_lz_2d_v4f32_f16(half %s, half %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_lz_2d_v4f32_f16(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3572,9 +3921,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32, half, half,
 ; llvm.amdgcn.image.gather4.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3586,9 +3938,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32, i32, float, f
 ; llvm.amdgcn.image.gather4.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3600,9 +3955,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32, i32, float
 ; llvm.amdgcn.image.gather4.l.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_l_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_l_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %lod, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_l_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3614,9 +3972,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.gather4.b.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3628,9 +3989,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32, i32, fl
 ; llvm.amdgcn.image.gather4.b.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3642,9 +4006,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.gather4.lz.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3656,9 +4023,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32, i32, float
 ; llvm.amdgcn.image.gather4.c.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3670,9 +4040,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32, i32, float,
 ; llvm.amdgcn.image.gather4.c.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3684,9 +4057,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32, i32, flo
 ; llvm.amdgcn.image.gather4.c.l.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3698,9 +4074,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32, i32, floa
 ; llvm.amdgcn.image.gather4.c.b.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3712,9 +4091,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32, i32,
 ; llvm.amdgcn.image.gather4.c.b.cl.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3726,9 +4108,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32, i3
 ; llvm.amdgcn.image.gather4.c.lz.o
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(
-; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 define amdgpu_ps float @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3740,10 +4125,11 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32, i32, flo
 ; llvm.amdgcn.image.getlod
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_getlod_1d_v4f32_f32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getlod.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_getlod_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_image_getlod_1d_v4f32_f32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.getlod.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3755,10 +4141,11 @@ declare <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32, float, <8 x i32>
 ; llvm.amdgcn.image.load
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_load_2dmsaa_v4f32_i32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_load_2dmsaa_v4f32_i32(i32 %s, i32 %t, i32 %sample, <8 x i32> inreg %sampler) #0 {
+; CHECK-LABEL: @extract_elt0_image_load_2dmsaa_v4f32_i32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SAMPLE:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3770,10 +4157,11 @@ declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32,
 ; llvm.amdgcn.image.load.mip
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_load_mip_1d_v4f32_i32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32 1, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_load_mip_1d_v4f32_i32(i32 %s, i32 %mip, <8 x i32> inreg %sampler) #0 {
+; CHECK-LABEL: @extract_elt0_image_load_mip_1d_v4f32_i32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32 1, i32 [[S:%.*]], i32 [[MIP:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3785,10 +4173,11 @@ declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x
 ; llvm.amdgcn.image.getresinfo
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_image_getresinfo_1d_v4f32_i32(
-; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
-; CHECK-NEXT: ret float %data
 define amdgpu_ps float @extract_elt0_image_getresinfo_1d_v4f32_i32(i32 %mip, <8 x i32> inreg %sampler) #0 {
+; CHECK-LABEL: @extract_elt0_image_getresinfo_1d_v4f32_i32(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 [[MIP:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
@@ -3800,9 +4189,13 @@ declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32, i32, <8 x i3
 ; TFE / LWE
 ; --------------------------------------------------------------------
 
-; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32(
-; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
 define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x i32> inreg %rsrc) #0 {
+; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32(
+; CHECK-NEXT:    [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 1)
+; CHECK-NEXT:    [[RGBA:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0
+; CHECK-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[RGBA]], i32 0
+; CHECK-NEXT:    ret float [[ELT0]]
+;
   %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
   %rgba = extractvalue { <4 x float>, i32 } %data, 0
   %elt0 = extractelement <4 x float> %rgba, i32 0
@@ -3811,10 +4204,11 @@ define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x
 
 declare {<4 x float>, i32} @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32, i32, <8 x i32>, i32, i32) #1
 
-; CHECK: @tfe_check_assert(
-; CHECK: %data = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1)
-; CHECK-NEXT: ret float %data
 define amdgpu_hs float @tfe_check_assert() #0 {
+; CHECK-LABEL: @tfe_check_assert(
+; CHECK-NEXT:    [[DATA:%.*]] = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1)
+; CHECK-NEXT:    ret float [[DATA]]
+;
   %data = call nsz <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1) #2
   %elt0 = extractelement <4 x float> %data, i32 0
   ret float %elt0
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll
index a4c84905b5dacc..071f322b45def7 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll
@@ -5,6 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_add_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -20,6 +21,7 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_add_ss_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -32,6 +34,7 @@ define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -51,6 +54,7 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_ss_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -63,6 +67,7 @@ define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x
 }
 
 define float @test_add_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_add_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -82,6 +87,7 @@ define float @test_add_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_add_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -95,6 +101,7 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_add_sd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -105,6 +112,7 @@ define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -122,6 +130,7 @@ define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_sd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -132,6 +141,7 @@ define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_add_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_add_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -147,6 +157,7 @@ define double @test_add_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_sub_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -162,6 +173,7 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_sub_ss_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -174,6 +186,7 @@ define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -193,6 +206,7 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_ss_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -205,6 +219,7 @@ define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x
 }
 
 define float @test_sub_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_sub_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -224,6 +239,7 @@ define float @test_sub_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_sub_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -237,6 +253,7 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_sub_sd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -247,6 +264,7 @@ define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -264,6 +282,7 @@ define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_sd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -274,6 +293,7 @@ define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_sub_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_sub_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -289,6 +309,7 @@ define double @test_sub_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_mul_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -304,6 +325,7 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_mul_ss_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -316,6 +338,7 @@ define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -335,6 +358,7 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_ss_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -347,6 +371,7 @@ define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x
 }
 
 define float @test_mul_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_mul_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -366,6 +391,7 @@ define float @test_mul_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_mul_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -379,6 +405,7 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_mul_sd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -389,6 +416,7 @@ define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -406,6 +434,7 @@ define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_sd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -416,6 +445,7 @@ define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_mul_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_mul_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -431,6 +461,7 @@ define double @test_mul_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_div_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -446,6 +477,7 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_div_ss_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -458,6 +490,7 @@ define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -477,6 +510,7 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_ss_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -489,6 +523,7 @@ define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x
 }
 
 define float @test_div_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_div_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -508,6 +543,7 @@ define float @test_div_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_div_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -521,6 +557,7 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_div_sd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -531,6 +568,7 @@ define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -548,6 +586,7 @@ define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_sd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -558,6 +597,7 @@ define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_div_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_div_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -573,6 +613,7 @@ define double @test_div_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_max_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -585,6 +626,7 @@ define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_max_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -597,6 +639,7 @@ define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_max_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_max_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -616,6 +659,7 @@ define float @test_max_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_max_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -626,6 +670,7 @@ define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_max_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -636,6 +681,7 @@ define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_max_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_max_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -651,6 +697,7 @@ define double @test_max_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_min_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -663,6 +710,7 @@ define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_min_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -675,6 +723,7 @@ define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_min_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_min_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -694,6 +743,7 @@ define float @test_min_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_min_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -704,6 +754,7 @@ define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_min_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -714,6 +765,7 @@ define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_min_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_min_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -729,6 +781,7 @@ define double @test_min_sd_1(double %a, double %b) {
 declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
 
 define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) {
+;
 ; CHECK-LABEL: @test_cmp_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret i8 [[TMP1]]
@@ -746,6 +799,7 @@ define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
 
 define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) {
+;
 ; CHECK-LABEL: @test_cmp_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret i8 [[TMP1]]
@@ -757,6 +811,7 @@ define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) {
 }
 
 define i64 @test(float %f, double %d) {
+;
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0
 ; CHECK-NEXT:    [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[V03]], i32 4)
@@ -837,6 +892,7 @@ declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32)
 declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32)
 
 define i64 @test2(float %f, double %d) {
+;
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:    [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0
 ; CHECK-NEXT:    [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[V03]], i32 4)
@@ -919,6 +975,7 @@ declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32)
 declare float @llvm.fma.f32(float, float, float) #1
 
 define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
@@ -948,6 +1005,7 @@ define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x flo
 }
 
 define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -974,6 +1032,7 @@ define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -995,6 +1054,7 @@ define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float>
 declare double @llvm.fma.f64(double, double, double) #1
 
 define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1020,6 +1080,7 @@ define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x
 }
 
 define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1044,6 +1105,7 @@ define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x doub
 }
 
 define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1061,6 +1123,7 @@ define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x doub
 }
 
 define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
@@ -1090,6 +1153,7 @@ define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl
 }
 
 define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -1116,6 +1180,7 @@ define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1135,6 +1200,7 @@ define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1160,6 +1226,7 @@ define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x
 }
 
 define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1184,6 +1251,7 @@ define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1201,6 +1269,7 @@ define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
@@ -1230,6 +1299,7 @@ define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl
 }
 
 define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -1256,6 +1326,7 @@ define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1275,6 +1346,7 @@ define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1300,6 +1372,7 @@ define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x
 }
 
 define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1324,6 +1397,7 @@ define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1341,6 +1415,7 @@ define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
@@ -1374,6 +1449,7 @@ define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x fl
 }
 
 define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -1404,6 +1480,7 @@ define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1425,6 +1502,7 @@ define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_mask3_vfmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_ss_1_unary_fneg(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1446,6 +1524,7 @@ define float @test_mask3_vfmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b,
 }
 
 define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1475,6 +1554,7 @@ define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x
 }
 
 define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1503,6 +1583,7 @@ define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1522,6 +1603,7 @@ define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_mask3_vfmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_sd_1_unary_fneg(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1541,6 +1623,7 @@ define double @test_mask3_vfmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %
 }
 
 define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg float [[TMP1]]
@@ -1576,6 +1659,7 @@ define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x f
 }
 
 define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg float [[TMP1]]
@@ -1608,6 +1692,7 @@ define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float
 }
 
 define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1630,6 +1715,7 @@ define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float
 }
 
 define float @test_mask3_vfnmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss_1_unary_fneg(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1652,6 +1738,7 @@ define float @test_mask3_vfnmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b,
 }
 
 define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg double [[TMP1]]
@@ -1683,6 +1770,7 @@ define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg double [[TMP1]]
@@ -1713,6 +1801,7 @@ define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x do
 }
 
 define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1733,6 +1822,7 @@ define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x do
 }
 
 define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd_1_unary_fneg(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1755,6 +1845,7 @@ define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double>
 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
 
 define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_si_256(
 ; CHECK-NEXT:    ret <8 x i32> [[A0:%.*]]
 ;
@@ -1763,6 +1854,7 @@ define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) {
 }
 
 define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_si_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]]
@@ -1775,6 +1867,7 @@ define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pa
 }
 
 define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_si_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -1784,6 +1877,7 @@ define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) {
 }
 
 define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_si_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1797,6 +1891,7 @@ define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passth
 }
 
 define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_si_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -1806,6 +1901,7 @@ define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) {
 }
 
 define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_si_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1819,6 +1915,7 @@ define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pas
 }
 
 define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_si_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -1828,6 +1925,7 @@ define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) {
 }
 
 define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_si_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1843,6 +1941,7 @@ define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passt
 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
 
 define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_sf_256(
 ; CHECK-NEXT:    ret <8 x float> [[A0:%.*]]
 ;
@@ -1851,6 +1950,7 @@ define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) {
 }
 
 define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_sf_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]]
@@ -1863,6 +1963,7 @@ define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x floa
 }
 
 define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_sf_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
@@ -1872,6 +1973,7 @@ define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) {
 }
 
 define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_sf_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1885,6 +1987,7 @@ define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %
 }
 
 define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_sf_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
@@ -1894,6 +1997,7 @@ define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) {
 }
 
 define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1907,6 +2011,7 @@ define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float
 }
 
 define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_sf_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
@@ -1916,6 +2021,7 @@ define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) {
 }
 
 define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_sf_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1931,6 +2037,7 @@ define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float>
 declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>)
 
 define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_di_256(
 ; CHECK-NEXT:    ret <4 x i64> [[A0:%.*]]
 ;
@@ -1939,6 +2046,7 @@ define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) {
 }
 
 define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_di_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1953,6 +2061,7 @@ define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pa
 }
 
 define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_di_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -1962,6 +2071,7 @@ define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) {
 }
 
 define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_di_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1977,6 +2087,7 @@ define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passth
 }
 
 define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_di_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -1986,6 +2097,7 @@ define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) {
 }
 
 define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_di_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2001,6 +2113,7 @@ define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pas
 }
 
 define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_di_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -2010,6 +2123,7 @@ define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) {
 }
 
 define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_di_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2027,6 +2141,7 @@ define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passt
 declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>)
 
 define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_df_256(
 ; CHECK-NEXT:    ret <4 x double> [[A0:%.*]]
 ;
@@ -2035,6 +2150,7 @@ define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) {
 }
 
 define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_df_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2049,6 +2165,7 @@ define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x do
 }
 
 define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_df_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
@@ -2058,6 +2175,7 @@ define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) {
 }
 
 define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_df_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2073,6 +2191,7 @@ define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double
 }
 
 define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_df_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
@@ -2082,6 +2201,7 @@ define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) {
 }
 
 define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_df_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2097,6 +2217,7 @@ define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x dou
 }
 
 define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_df_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
@@ -2106,6 +2227,7 @@ define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) {
 }
 
 define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_df_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2123,6 +2245,7 @@ define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x doubl
 declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
 
 define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_si_512(
 ; CHECK-NEXT:    ret <16 x i32> [[A0:%.*]]
 ;
@@ -2131,6 +2254,7 @@ define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) {
 }
 
 define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_si_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]]
@@ -2143,6 +2267,7 @@ define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32>
 }
 
 define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_si_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
@@ -2152,6 +2277,7 @@ define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) {
 }
 
 define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_si_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2165,6 +2291,7 @@ define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pas
 }
 
 define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_si_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
@@ -2174,6 +2301,7 @@ define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) {
 }
 
 define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_si_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2187,6 +2315,7 @@ define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %
 }
 
 define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_si_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
@@ -2196,6 +2325,7 @@ define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) {
 }
 
 define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_si_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2211,6 +2341,7 @@ define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pa
 declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
 
 define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_sf_512(
 ; CHECK-NEXT:    ret <16 x float> [[A0:%.*]]
 ;
@@ -2219,6 +2350,7 @@ define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) {
 }
 
 define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_sf_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]]
@@ -2231,6 +2363,7 @@ define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x f
 }
 
 define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_sf_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -2240,6 +2373,7 @@ define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) {
 }
 
 define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_sf_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2253,6 +2387,7 @@ define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float
 }
 
 define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_sf_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -2262,6 +2397,7 @@ define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) {
 }
 
 define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2275,6 +2411,7 @@ define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x fl
 }
 
 define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_sf_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -2284,6 +2421,7 @@ define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) {
 }
 
 define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_sf_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2299,6 +2437,7 @@ define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x floa
 declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
 
 define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_di_512(
 ; CHECK-NEXT:    ret <8 x i64> [[A0:%.*]]
 ;
@@ -2307,6 +2446,7 @@ define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) {
 }
 
 define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_di_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]]
@@ -2319,6 +2459,7 @@ define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pa
 }
 
 define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_di_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
@@ -2328,6 +2469,7 @@ define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) {
 }
 
 define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_di_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2341,6 +2483,7 @@ define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passth
 }
 
 define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_di_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
@@ -2350,6 +2493,7 @@ define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) {
 }
 
 define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_di_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2363,6 +2507,7 @@ define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pas
 }
 
 define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_di_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
@@ -2372,6 +2517,7 @@ define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) {
 }
 
 define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_di_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2387,6 +2533,7 @@ define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passt
 declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
 
 define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_df_512(
 ; CHECK-NEXT:    ret <8 x double> [[A0:%.*]]
 ;
@@ -2395,6 +2542,7 @@ define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) {
 }
 
 define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_df_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]]
@@ -2407,6 +2555,7 @@ define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x do
 }
 
 define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_df_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -2416,6 +2565,7 @@ define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) {
 }
 
 define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_df_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2429,6 +2579,7 @@ define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double
 }
 
 define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_df_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -2438,6 +2589,7 @@ define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) {
 }
 
 define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_df_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2451,6 +2603,7 @@ define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x dou
 }
 
 define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_df_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -2460,6 +2613,7 @@ define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) {
 }
 
 define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_df_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2475,6 +2629,7 @@ define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x doubl
 declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>)
 
 define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_128(
 ; CHECK-NEXT:    ret <8 x i16> [[A0:%.*]]
 ;
@@ -2483,6 +2638,7 @@ define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) {
 }
 
 define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]]
@@ -2495,6 +2651,7 @@ define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pa
 }
 
 define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
@@ -2504,6 +2661,7 @@ define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) {
 }
 
 define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2517,6 +2675,7 @@ define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passth
 }
 
 define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
@@ -2526,6 +2685,7 @@ define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) {
 }
 
 define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2539,6 +2699,7 @@ define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pas
 }
 
 define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
@@ -2548,6 +2709,7 @@ define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) {
 }
 
 define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2563,6 +2725,7 @@ define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passt
 declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>)
 
 define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_256(
 ; CHECK-NEXT:    ret <16 x i16> [[A0:%.*]]
 ;
@@ -2571,6 +2734,7 @@ define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) {
 }
 
 define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]]
@@ -2583,6 +2747,7 @@ define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16>
 }
 
 define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
@@ -2592,6 +2757,7 @@ define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) {
 }
 
 define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2605,6 +2771,7 @@ define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pas
 }
 
 define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
@@ -2614,6 +2781,7 @@ define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) {
 }
 
 define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2627,6 +2795,7 @@ define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %
 }
 
 define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
@@ -2636,6 +2805,7 @@ define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) {
 }
 
 define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2651,6 +2821,7 @@ define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pa
 declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>)
 
 define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_512(
 ; CHECK-NEXT:    ret <32 x i16> [[A0:%.*]]
 ;
@@ -2659,6 +2830,7 @@ define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) {
 }
 
 define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]]
@@ -2671,6 +2843,7 @@ define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16>
 }
 
 define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
@@ -2680,6 +2853,7 @@ define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) {
 }
 
 define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2693,6 +2867,7 @@ define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pas
 }
 
 define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
@@ -2702,6 +2877,7 @@ define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) {
 }
 
 define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2715,6 +2891,7 @@ define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %
 }
 
 define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
@@ -2724,6 +2901,7 @@ define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) {
 }
 
 define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2739,6 +2917,7 @@ define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pa
 declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>)
 
 define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_128(
 ; CHECK-NEXT:    ret <16 x i8> [[A0:%.*]]
 ;
@@ -2747,6 +2926,7 @@ define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) {
 }
 
 define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]]
@@ -2759,6 +2939,7 @@ define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pa
 }
 
 define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
@@ -2768,6 +2949,7 @@ define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) {
 }
 
 define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2781,6 +2963,7 @@ define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passth
 }
 
 define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
@@ -2790,6 +2973,7 @@ define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) {
 }
 
 define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2803,6 +2987,7 @@ define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pas
 }
 
 define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
@@ -2812,6 +2997,7 @@ define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) {
 }
 
 define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2827,6 +3013,7 @@ define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passt
 declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>)
 
 define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_256(
 ; CHECK-NEXT:    ret <32 x i8> [[A0:%.*]]
 ;
@@ -2835,6 +3022,7 @@ define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) {
 }
 
 define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]]
@@ -2847,6 +3035,7 @@ define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pa
 }
 
 define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
@@ -2856,6 +3045,7 @@ define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) {
 }
 
 define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2869,6 +3059,7 @@ define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passth
 }
 
 define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
@@ -2878,6 +3069,7 @@ define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) {
 }
 
 define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2891,6 +3083,7 @@ define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pas
 }
 
 define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
@@ -2900,6 +3093,7 @@ define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) {
 }
 
 define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2915,6 +3109,7 @@ define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passt
 declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>)
 
 define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_512(
 ; CHECK-NEXT:    ret <64 x i8> [[A0:%.*]]
 ;
@@ -2923,6 +3118,7 @@ define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) {
 }
 
 define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]]
@@ -2935,6 +3131,7 @@ define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pa
 }
 
 define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
@@ -2944,6 +3141,7 @@ define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) {
 }
 
 define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
@@ -2957,6 +3155,7 @@ define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passth
 }
 
 define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
@@ -2966,6 +3165,7 @@ define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) {
 }
 
 define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
@@ -2979,6 +3179,7 @@ define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pas
 }
 
 define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
@@ -2988,6 +3189,7 @@ define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) {
 }
 
 define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
@@ -3003,6 +3205,7 @@ define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passt
 declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_add_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3012,6 +3215,7 @@ define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_add_ps_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3021,6 +3225,7 @@ define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_add_ps_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3034,6 +3239,7 @@ define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 }
 
 define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_add_ps_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3049,6 +3255,7 @@ define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <1
 declare <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_add_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3058,6 +3265,7 @@ define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_add_pd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3067,6 +3275,7 @@ define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_pd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3080,6 +3289,7 @@ define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 }
 
 define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_pd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3095,6 +3305,7 @@ define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8
 declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_sub_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3104,6 +3315,7 @@ define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_sub_ps_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3113,6 +3325,7 @@ define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_sub_ps_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3126,6 +3339,7 @@ define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 }
 
 define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_sub_ps_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3141,6 +3355,7 @@ define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <1
 declare <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_sub_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3150,6 +3365,7 @@ define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_sub_pd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3159,6 +3375,7 @@ define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_pd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3172,6 +3389,7 @@ define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 }
 
 define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_pd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3187,6 +3405,7 @@ define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8
 declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_mul_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3196,6 +3415,7 @@ define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_mul_ps_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3205,6 +3425,7 @@ define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_mul_ps_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3218,6 +3439,7 @@ define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 }
 
 define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_mul_ps_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3233,6 +3455,7 @@ define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <1
 declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_mul_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3242,6 +3465,7 @@ define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_mul_pd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3251,6 +3475,7 @@ define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_pd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3264,6 +3489,7 @@ define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 }
 
 define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_pd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3279,6 +3505,7 @@ define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8
 declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_div_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3288,6 +3515,7 @@ define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_div_ps_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3297,6 +3525,7 @@ define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_div_ps_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3310,6 +3539,7 @@ define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 }
 
 define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_div_ps_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3325,6 +3555,7 @@ define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <1
 declare <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_div_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3334,6 +3565,7 @@ define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_div_pd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3343,6 +3575,7 @@ define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_pd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3356,6 +3589,7 @@ define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 }
 
 define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_pd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3371,6 +3605,7 @@ define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8
 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
 
 define i32 @test_comi_ss_0(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_comi_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i32 0
@@ -3392,6 +3627,7 @@ define i32 @test_comi_ss_0(float %a, float %b) {
 declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
 
 define i32 @test_comi_sd_0(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_comi_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
index 7772946751c5c5..dc735e0b490851 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
@@ -5,6 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_add_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -20,6 +21,7 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_add_ss_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -32,6 +34,7 @@ define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -51,6 +54,7 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_ss_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -63,6 +67,7 @@ define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x
 }
 
 define float @test_add_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_add_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -82,6 +87,7 @@ define float @test_add_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_add_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -95,6 +101,7 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_add_sd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -105,6 +112,7 @@ define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -122,6 +130,7 @@ define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_sd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -132,6 +141,7 @@ define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_add_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_add_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -147,6 +157,7 @@ define double @test_add_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_sub_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -162,6 +173,7 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_sub_ss_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -174,6 +186,7 @@ define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -193,6 +206,7 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_ss_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -205,6 +219,7 @@ define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x
 }
 
 define float @test_sub_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_sub_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -224,6 +239,7 @@ define float @test_sub_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_sub_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -237,6 +253,7 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_sub_sd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -247,6 +264,7 @@ define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -264,6 +282,7 @@ define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_sd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -274,6 +293,7 @@ define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_sub_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_sub_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -289,6 +309,7 @@ define double @test_sub_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_mul_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -304,6 +325,7 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_mul_ss_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -316,6 +338,7 @@ define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -335,6 +358,7 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_ss_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -347,6 +371,7 @@ define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x
 }
 
 define float @test_mul_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_mul_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -366,6 +391,7 @@ define float @test_mul_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_mul_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -379,6 +405,7 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_mul_sd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -389,6 +416,7 @@ define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -406,6 +434,7 @@ define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_sd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -416,6 +445,7 @@ define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_mul_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_mul_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -431,6 +461,7 @@ define double @test_mul_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_div_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -446,6 +477,7 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_div_ss_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -458,6 +490,7 @@ define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -477,6 +510,7 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_ss_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -489,6 +523,7 @@ define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x
 }
 
 define float @test_div_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_div_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -508,6 +543,7 @@ define float @test_div_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_div_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -521,6 +557,7 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_div_sd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -531,6 +568,7 @@ define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -548,6 +586,7 @@ define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_sd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -558,6 +597,7 @@ define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_div_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_div_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -573,6 +613,7 @@ define double @test_div_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_max_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -585,6 +626,7 @@ define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_max_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -597,6 +639,7 @@ define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_max_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_max_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -616,6 +659,7 @@ define float @test_max_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_max_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -626,6 +670,7 @@ define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_max_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -636,6 +681,7 @@ define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_max_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_max_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -651,6 +697,7 @@ define double @test_max_sd_1(double %a, double %b) {
 declare <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
+;
 ; CHECK-LABEL: @test_min_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -663,6 +710,7 @@ define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_min_ss_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
@@ -675,6 +723,7 @@ define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_min_ss_1(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_min_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -694,6 +743,7 @@ define float @test_min_ss_1(float %a, float %b) {
 declare <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
+;
 ; CHECK-LABEL: @test_min_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -704,6 +754,7 @@ define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_min_sd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
@@ -714,6 +765,7 @@ define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_min_sd_1(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_min_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -729,6 +781,7 @@ define double @test_min_sd_1(double %a, double %b) {
 declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
 
 define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) {
+;
 ; CHECK-LABEL: @test_cmp_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret i8 [[TMP1]]
@@ -746,6 +799,7 @@ define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
 
 define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) {
+;
 ; CHECK-LABEL: @test_cmp_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
 ; CHECK-NEXT:    ret i8 [[TMP1]]
@@ -757,6 +811,7 @@ define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) {
 }
 
 define i64 @test(float %f, double %d) {
+;
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0
 ; CHECK-NEXT:    [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[V03]], i32 4)
@@ -837,6 +892,7 @@ declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32)
 declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32)
 
 define i64 @test2(float %f, double %d) {
+;
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:    [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0
 ; CHECK-NEXT:    [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[V03]], i32 4)
@@ -919,6 +975,7 @@ declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32)
 declare float @llvm.fma.f32(float, float, float) #1
 
 define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
@@ -948,6 +1005,7 @@ define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x flo
 }
 
 define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -974,6 +1032,7 @@ define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -995,6 +1054,7 @@ define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float>
 declare double @llvm.fma.f64(double, double, double) #1
 
 define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1020,6 +1080,7 @@ define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x
 }
 
 define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1044,6 +1105,7 @@ define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x doub
 }
 
 define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask_vfmadd_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1061,6 +1123,7 @@ define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x doub
 }
 
 define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
@@ -1090,6 +1153,7 @@ define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl
 }
 
 define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -1116,6 +1180,7 @@ define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1135,6 +1200,7 @@ define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1160,6 +1226,7 @@ define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x
 }
 
 define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1184,6 +1251,7 @@ define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_maskz_vfmadd_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1201,6 +1269,7 @@ define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
@@ -1230,6 +1299,7 @@ define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl
 }
 
 define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -1256,6 +1326,7 @@ define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1275,6 +1346,7 @@ define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1300,6 +1372,7 @@ define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x
 }
 
 define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1324,6 +1397,7 @@ define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmadd_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1341,6 +1415,7 @@ define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
@@ -1374,6 +1449,7 @@ define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x fl
 }
 
 define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -1404,6 +1480,7 @@ define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1425,6 +1502,7 @@ define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float>
 }
 
 define float @test_mask3_vfmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_ss_1_unary_fneg(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1446,6 +1524,7 @@ define float @test_mask3_vfmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b,
 }
 
 define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1475,6 +1554,7 @@ define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x
 }
 
 define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -1503,6 +1583,7 @@ define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1522,6 +1603,7 @@ define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou
 }
 
 define double @test_mask3_vfmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfmsub_sd_1_unary_fneg(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1541,6 +1623,7 @@ define double @test_mask3_vfmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %
 }
 
 define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg float [[TMP1]]
@@ -1576,6 +1659,7 @@ define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x f
 }
 
 define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg float [[TMP1]]
@@ -1608,6 +1692,7 @@ define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float
 }
 
 define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1630,6 +1715,7 @@ define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float
 }
 
 define float @test_mask3_vfnmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_ss_1_unary_fneg(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -1652,6 +1738,7 @@ define float @test_mask3_vfnmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b,
 }
 
 define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg double [[TMP1]]
@@ -1683,6 +1770,7 @@ define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2
 }
 
 define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = fneg double [[TMP1]]
@@ -1713,6 +1801,7 @@ define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x do
 }
 
 define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1733,6 +1822,7 @@ define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x do
 }
 
 define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mask3_vfnmsub_sd_1_unary_fneg(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
@@ -1755,6 +1845,7 @@ define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double>
 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
 
 define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_si_256(
 ; CHECK-NEXT:    ret <8 x i32> [[A0:%.*]]
 ;
@@ -1763,6 +1854,7 @@ define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) {
 }
 
 define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_si_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]]
@@ -1775,6 +1867,7 @@ define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pa
 }
 
 define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_si_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -1784,6 +1877,7 @@ define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) {
 }
 
 define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_si_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1797,6 +1891,7 @@ define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passth
 }
 
 define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_si_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -1806,6 +1901,7 @@ define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) {
 }
 
 define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_si_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1819,6 +1915,7 @@ define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pas
 }
 
 define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_si_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -1828,6 +1925,7 @@ define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) {
 }
 
 define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_si_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1843,6 +1941,7 @@ define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passt
 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
 
 define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_sf_256(
 ; CHECK-NEXT:    ret <8 x float> [[A0:%.*]]
 ;
@@ -1851,6 +1950,7 @@ define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) {
 }
 
 define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_sf_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]]
@@ -1863,6 +1963,7 @@ define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x floa
 }
 
 define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_sf_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
@@ -1872,6 +1973,7 @@ define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) {
 }
 
 define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_sf_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1885,6 +1987,7 @@ define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %
 }
 
 define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_sf_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
@@ -1894,6 +1997,7 @@ define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) {
 }
 
 define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1907,6 +2011,7 @@ define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float
 }
 
 define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_sf_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
@@ -1916,6 +2021,7 @@ define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) {
 }
 
 define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_sf_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1931,6 +2037,7 @@ define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float>
 declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>)
 
 define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_di_256(
 ; CHECK-NEXT:    ret <4 x i64> [[A0:%.*]]
 ;
@@ -1939,6 +2046,7 @@ define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) {
 }
 
 define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_di_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1953,6 +2061,7 @@ define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pa
 }
 
 define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_di_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -1962,6 +2071,7 @@ define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) {
 }
 
 define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_di_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -1977,6 +2087,7 @@ define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passth
 }
 
 define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_di_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -1986,6 +2097,7 @@ define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) {
 }
 
 define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_di_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2001,6 +2113,7 @@ define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pas
 }
 
 define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_di_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -2010,6 +2123,7 @@ define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) {
 }
 
 define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_di_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2027,6 +2141,7 @@ define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passt
 declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>)
 
 define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_df_256(
 ; CHECK-NEXT:    ret <4 x double> [[A0:%.*]]
 ;
@@ -2035,6 +2150,7 @@ define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) {
 }
 
 define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_df_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2049,6 +2165,7 @@ define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x do
 }
 
 define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_df_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
@@ -2058,6 +2175,7 @@ define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) {
 }
 
 define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_df_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2073,6 +2191,7 @@ define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double
 }
 
 define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_df_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
@@ -2082,6 +2201,7 @@ define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) {
 }
 
 define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_df_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2097,6 +2217,7 @@ define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x dou
 }
 
 define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_df_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
@@ -2106,6 +2227,7 @@ define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) {
 }
 
 define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_df_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2123,6 +2245,7 @@ define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x doubl
 declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
 
 define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_si_512(
 ; CHECK-NEXT:    ret <16 x i32> [[A0:%.*]]
 ;
@@ -2131,6 +2254,7 @@ define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) {
 }
 
 define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_si_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]]
@@ -2143,6 +2267,7 @@ define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32>
 }
 
 define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_si_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
@@ -2152,6 +2277,7 @@ define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) {
 }
 
 define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_si_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2165,6 +2291,7 @@ define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pas
 }
 
 define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_si_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
@@ -2174,6 +2301,7 @@ define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) {
 }
 
 define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_si_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2187,6 +2315,7 @@ define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %
 }
 
 define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_si_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
@@ -2196,6 +2325,7 @@ define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) {
 }
 
 define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_si_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2211,6 +2341,7 @@ define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pa
 declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
 
 define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_sf_512(
 ; CHECK-NEXT:    ret <16 x float> [[A0:%.*]]
 ;
@@ -2219,6 +2350,7 @@ define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) {
 }
 
 define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_sf_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]]
@@ -2231,6 +2363,7 @@ define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x f
 }
 
 define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_sf_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -2240,6 +2373,7 @@ define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) {
 }
 
 define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_sf_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2253,6 +2387,7 @@ define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float
 }
 
 define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_sf_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -2262,6 +2397,7 @@ define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) {
 }
 
 define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2275,6 +2411,7 @@ define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x fl
 }
 
 define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_sf_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -2284,6 +2421,7 @@ define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) {
 }
 
 define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_sf_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2299,6 +2437,7 @@ define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x floa
 declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
 
 define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_di_512(
 ; CHECK-NEXT:    ret <8 x i64> [[A0:%.*]]
 ;
@@ -2307,6 +2446,7 @@ define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) {
 }
 
 define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_di_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]]
@@ -2319,6 +2459,7 @@ define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pa
 }
 
 define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_di_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
@@ -2328,6 +2469,7 @@ define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) {
 }
 
 define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_di_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2341,6 +2483,7 @@ define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passth
 }
 
 define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_di_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
@@ -2350,6 +2493,7 @@ define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) {
 }
 
 define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_di_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2363,6 +2507,7 @@ define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pas
 }
 
 define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_di_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
@@ -2372,6 +2517,7 @@ define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) {
 }
 
 define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_di_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2387,6 +2533,7 @@ define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passt
 declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
 
 define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_df_512(
 ; CHECK-NEXT:    ret <8 x double> [[A0:%.*]]
 ;
@@ -2395,6 +2542,7 @@ define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) {
 }
 
 define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_df_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]]
@@ -2407,6 +2555,7 @@ define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x do
 }
 
 define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_df_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -2416,6 +2565,7 @@ define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) {
 }
 
 define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_df_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2429,6 +2579,7 @@ define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double
 }
 
 define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_df_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -2438,6 +2589,7 @@ define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) {
 }
 
 define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_df_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2451,6 +2603,7 @@ define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x dou
 }
 
 define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_df_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -2460,6 +2613,7 @@ define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) {
 }
 
 define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_df_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2475,6 +2629,7 @@ define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x doubl
 declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>)
 
 define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_128(
 ; CHECK-NEXT:    ret <8 x i16> [[A0:%.*]]
 ;
@@ -2483,6 +2638,7 @@ define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) {
 }
 
 define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]]
@@ -2495,6 +2651,7 @@ define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pa
 }
 
 define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
@@ -2504,6 +2661,7 @@ define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) {
 }
 
 define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2517,6 +2675,7 @@ define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passth
 }
 
 define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
@@ -2526,6 +2685,7 @@ define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) {
 }
 
 define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2539,6 +2699,7 @@ define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pas
 }
 
 define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
@@ -2548,6 +2709,7 @@ define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) {
 }
 
 define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -2563,6 +2725,7 @@ define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passt
 declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>)
 
 define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_256(
 ; CHECK-NEXT:    ret <16 x i16> [[A0:%.*]]
 ;
@@ -2571,6 +2734,7 @@ define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) {
 }
 
 define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]]
@@ -2583,6 +2747,7 @@ define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16>
 }
 
 define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
@@ -2592,6 +2757,7 @@ define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) {
 }
 
 define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2605,6 +2771,7 @@ define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pas
 }
 
 define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
@@ -2614,6 +2781,7 @@ define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) {
 }
 
 define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2627,6 +2795,7 @@ define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %
 }
 
 define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
@@ -2636,6 +2805,7 @@ define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) {
 }
 
 define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2651,6 +2821,7 @@ define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pa
 declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>)
 
 define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_512(
 ; CHECK-NEXT:    ret <32 x i16> [[A0:%.*]]
 ;
@@ -2659,6 +2830,7 @@ define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) {
 }
 
 define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_hi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]]
@@ -2671,6 +2843,7 @@ define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16>
 }
 
 define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
@@ -2680,6 +2853,7 @@ define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) {
 }
 
 define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_hi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2693,6 +2867,7 @@ define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pas
 }
 
 define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
@@ -2702,6 +2877,7 @@ define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) {
 }
 
 define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2715,6 +2891,7 @@ define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %
 }
 
 define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
@@ -2724,6 +2901,7 @@ define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) {
 }
 
 define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_hi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2739,6 +2917,7 @@ define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pa
 declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>)
 
 define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_128(
 ; CHECK-NEXT:    ret <16 x i8> [[A0:%.*]]
 ;
@@ -2747,6 +2926,7 @@ define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) {
 }
 
 define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]]
@@ -2759,6 +2939,7 @@ define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pa
 }
 
 define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
@@ -2768,6 +2949,7 @@ define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) {
 }
 
 define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2781,6 +2963,7 @@ define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passth
 }
 
 define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
@@ -2790,6 +2973,7 @@ define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) {
 }
 
 define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2803,6 +2987,7 @@ define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pas
 }
 
 define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
@@ -2812,6 +2997,7 @@ define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) {
 }
 
 define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_128_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -2827,6 +3013,7 @@ define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passt
 declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>)
 
 define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_256(
 ; CHECK-NEXT:    ret <32 x i8> [[A0:%.*]]
 ;
@@ -2835,6 +3022,7 @@ define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) {
 }
 
 define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]]
@@ -2847,6 +3035,7 @@ define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pa
 }
 
 define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
@@ -2856,6 +3045,7 @@ define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) {
 }
 
 define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2869,6 +3059,7 @@ define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passth
 }
 
 define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
@@ -2878,6 +3069,7 @@ define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) {
 }
 
 define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2891,6 +3083,7 @@ define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pas
 }
 
 define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
@@ -2900,6 +3093,7 @@ define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) {
 }
 
 define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_256_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -2915,6 +3109,7 @@ define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passt
 declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>)
 
 define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_512(
 ; CHECK-NEXT:    ret <64 x i8> [[A0:%.*]]
 ;
@@ -2923,6 +3118,7 @@ define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) {
 }
 
 define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+;
 ; CHECK-LABEL: @identity_test_permvar_qi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]]
@@ -2935,6 +3131,7 @@ define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pa
 }
 
 define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
@@ -2944,6 +3141,7 @@ define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) {
 }
 
 define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+;
 ; CHECK-LABEL: @zero_test_permvar_qi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
@@ -2957,6 +3155,7 @@ define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passth
 }
 
 define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
@@ -2966,6 +3165,7 @@ define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) {
 }
 
 define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+;
 ; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
@@ -2979,6 +3179,7 @@ define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pas
 }
 
 define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
@@ -2988,6 +3189,7 @@ define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) {
 }
 
 define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+;
 ; CHECK-LABEL: @undef_test_permvar_qi_512_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
@@ -3003,6 +3205,7 @@ define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passt
 declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_add_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3012,6 +3215,7 @@ define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_add_ps_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3021,6 +3225,7 @@ define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_add_ps_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3034,6 +3239,7 @@ define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 }
 
 define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_add_ps_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3049,6 +3255,7 @@ define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <1
 declare <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_add_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3058,6 +3265,7 @@ define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_add_pd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3067,6 +3275,7 @@ define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_pd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3080,6 +3289,7 @@ define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 }
 
 define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_add_pd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3095,6 +3305,7 @@ define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8
 declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_sub_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3104,6 +3315,7 @@ define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_sub_ps_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3113,6 +3325,7 @@ define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_sub_ps_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3126,6 +3339,7 @@ define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 }
 
 define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_sub_ps_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3141,6 +3355,7 @@ define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <1
 declare <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_sub_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3150,6 +3365,7 @@ define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_sub_pd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3159,6 +3375,7 @@ define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_pd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3172,6 +3389,7 @@ define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 }
 
 define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_sub_pd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3187,6 +3405,7 @@ define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8
 declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_mul_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3196,6 +3415,7 @@ define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_mul_ps_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3205,6 +3425,7 @@ define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_mul_ps_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3218,6 +3439,7 @@ define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 }
 
 define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_mul_ps_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3233,6 +3455,7 @@ define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <1
 declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_mul_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3242,6 +3465,7 @@ define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_mul_pd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3251,6 +3475,7 @@ define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_pd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3264,6 +3489,7 @@ define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 }
 
 define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_mul_pd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3279,6 +3505,7 @@ define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8
 declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
 
 define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_div_ps(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3288,6 +3515,7 @@ define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) {
+;
 ; CHECK-LABEL: @test_div_ps_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <16 x float> [[TMP1]]
@@ -3297,6 +3525,7 @@ define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) {
 }
 
 define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_div_ps_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3310,6 +3539,7 @@ define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl
 }
 
 define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+;
 ; CHECK-LABEL: @test_div_ps_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -3325,6 +3555,7 @@ define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <1
 declare <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double>, <8 x double>, i32)
 
 define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_div_pd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3334,6 +3565,7 @@ define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) {
+;
 ; CHECK-LABEL: @test_div_pd_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    ret <8 x double> [[TMP1]]
@@ -3343,6 +3575,7 @@ define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) {
 }
 
 define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_pd_mask(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3356,6 +3589,7 @@ define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou
 }
 
 define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+;
 ; CHECK-LABEL: @test_div_pd_mask_round(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -3371,6 +3605,7 @@ define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8
 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
 
 define i32 @test_comi_ss_0(float %a, float %b) {
+;
 ; CHECK-LABEL: @test_comi_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i32 0
@@ -3392,6 +3627,7 @@ define i32 @test_comi_ss_0(float %a, float %b) {
 declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
 
 define i32 @test_comi_sd_0(double %a, double %b) {
+;
 ; CHECK-LABEL: @test_comi_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-fma.ll b/llvm/test/Transforms/InstCombine/X86/x86-fma.ll
index cddb1bf9c4e00a..959ee7efa55717 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-fma.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-fma.ll
@@ -3,6 +3,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define <4 x float> @test_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+;
 ; CHECK-LABEL: @test_vfmadd_ss(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
@@ -26,6 +27,7 @@ define <4 x float> @test_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %
 }
 
 define float @test_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+;
 ; CHECK-LABEL: @test_vfmadd_ss_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
@@ -46,6 +48,7 @@ define float @test_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 }
 
 define float @test_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+;
 ; CHECK-LABEL: @test_vfmadd_ss_1(
 ; CHECK-NEXT:    ret float 1.000000e+00
 ;
@@ -62,6 +65,7 @@ define float @test_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 }
 
 define <2 x double> @test_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+;
 ; CHECK-LABEL: @test_vfmadd_sd(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -81,6 +85,7 @@ define <2 x double> @test_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x doubl
 }
 
 define double @test_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+;
 ; CHECK-LABEL: @test_vfmadd_sd_0(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
@@ -99,6 +104,7 @@ define double @test_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %
 }
 
 define double @test_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+;
 ; CHECK-LABEL: @test_vfmadd_sd_1(
 ; CHECK-NEXT:    ret double 1.000000e+00
 ;
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll
index 719389470faca4..5cab62364c24d9 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll
@@ -1,12 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define i16 @test1(float %f) {
+;
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[TMP281:%.*]] = fadd float %f, -1.000000e+00
-; CHECK-NEXT:    [[TMP373:%.*]] = fmul float [[TMP281]], 5.000000e-01
-; CHECK-NEXT:    [[TMP374:%.*]] = insertelement <4 x float> poison, float [[TMP373]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP374]], <4 x float> <float 6.553500e+04, float poison, float poison, float poison>)
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd float [[F:%.*]], -1.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul float [[TMP1]], 5.000000e-01
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP48:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP3]], <4 x float> <float 6.553500e+04, float poison, float poison, float poison>)
 ; CHECK-NEXT:    [[TMP59:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP48]], <4 x float> <float 0.000000e+00, float poison, float poison, float poison>)
 ; CHECK-NEXT:    [[TMP_UPGRD_1:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[TMP59]])
 ; CHECK-NEXT:    [[TMP69:%.*]] = trunc i32 [[TMP_UPGRD_1]] to i16
@@ -26,23 +28,24 @@ define i16 @test1(float %f) {
 }
 
 define i64 @test3(float %f, double %d) {
+;
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[V00:%.*]] = insertelement <4 x float> poison, float %f, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[V00]])
-; CHECK-NEXT:    [[V10:%.*]] = insertelement <4 x float> poison, float %f, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> [[V10]])
-; CHECK-NEXT:    [[V20:%.*]] = insertelement <4 x float> poison, float %f, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[V20]])
-; CHECK-NEXT:    [[V30:%.*]] = insertelement <4 x float> poison, float %f, i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> [[V30]])
-; CHECK-NEXT:    [[V40:%.*]] = insertelement <2 x double> poison, double %d, i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[V40]])
-; CHECK-NEXT:    [[V50:%.*]] = insertelement <2 x double> poison, double %d, i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> [[V50]])
-; CHECK-NEXT:    [[V60:%.*]] = insertelement <2 x double> poison, double %d, i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[V60]])
-; CHECK-NEXT:    [[V70:%.*]] = insertelement <2 x double> poison, double %d, i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> [[V70]])
+; CHECK-NEXT:    [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[V03]])
+; CHECK-NEXT:    [[V13:%.*]] = insertelement <4 x float> poison, float [[F]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> [[V13]])
+; CHECK-NEXT:    [[V23:%.*]] = insertelement <4 x float> poison, float [[F]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[V23]])
+; CHECK-NEXT:    [[V33:%.*]] = insertelement <4 x float> poison, float [[F]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> [[V33]])
+; CHECK-NEXT:    [[V41:%.*]] = insertelement <2 x double> poison, double [[D:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[V41]])
+; CHECK-NEXT:    [[V51:%.*]] = insertelement <2 x double> poison, double [[D]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> [[V51]])
+; CHECK-NEXT:    [[V61:%.*]] = insertelement <2 x double> poison, double [[D]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[V61]])
+; CHECK-NEXT:    [[V71:%.*]] = insertelement <2 x double> poison, double [[D]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> [[V71]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
diff --git a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
index a7b233e5f71bb7..eba990b3c7965f 100644
--- a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
 define i32 @extract_load(<4 x i32>* %p) {
+;
 ; CHECK-LABEL: @extract_load(
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[X]], i32 1
@@ -13,6 +14,7 @@ define i32 @extract_load(<4 x i32>* %p) {
 }
 
 define double @extract_load_fp(<4 x double>* %p) {
+;
 ; CHECK-LABEL: @extract_load_fp(
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 3
@@ -24,6 +26,7 @@ define double @extract_load_fp(<4 x double>* %p) {
 }
 
 define double @extract_load_volatile(<4 x double>* %p) {
+;
 ; CHECK-LABEL: @extract_load_volatile(
 ; CHECK-NEXT:    [[X:%.*]] = load volatile <4 x double>, <4 x double>* [[P:%.*]], align 32
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 2
@@ -35,6 +38,7 @@ define double @extract_load_volatile(<4 x double>* %p) {
 }
 
 define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) {
+;
 ; CHECK-LABEL: @extract_load_extra_use(
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 8
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 0
@@ -48,6 +52,7 @@ define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) {
 }
 
 define double @extract_load_variable_index(<4 x double>* %p, i32 %y) {
+;
 ; CHECK-LABEL: @extract_load_variable_index(
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 [[Y:%.*]]
@@ -59,6 +64,7 @@ define double @extract_load_variable_index(<4 x double>* %p, i32 %y) {
 }
 
 define void @scalarize_phi(i32 * %n, float * %inout) {
+;
 ; CHECK-LABEL: @scalarize_phi(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[T0:%.*]] = load volatile float, float* [[INOUT:%.*]], align 4
@@ -103,6 +109,7 @@ for.end:
 }
 
 define float @extract_element_binop_splat_constant_index(<4 x float> %x) {
+;
 ; CHECK-LABEL: @extract_element_binop_splat_constant_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
 ; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], 0x4002A3D700000000
@@ -114,6 +121,7 @@ define float @extract_element_binop_splat_constant_index(<4 x float> %x) {
 }
 
 define double @extract_element_binop_splat_with_undef_constant_index(<2 x double> %x) {
+;
 ; CHECK-LABEL: @extract_element_binop_splat_with_undef_constant_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
 ; CHECK-NEXT:    [[R:%.*]] = fdiv double 4.200000e+01, [[TMP1]]
@@ -125,6 +133,7 @@ define double @extract_element_binop_splat_with_undef_constant_index(<2 x double
 }
 
 define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) {
+;
 ; CHECK-LABEL: @extract_element_binop_nonsplat_constant_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
 ; CHECK-NEXT:    [[R:%.*]] = fmul float [[TMP1]], 4.300000e+01
@@ -136,6 +145,7 @@ define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) {
 }
 
 define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_binop_splat_variable_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i8 [[TMP1]], 42
@@ -147,6 +157,7 @@ define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) {
 }
 
 define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_binop_splat_with_undef_variable_index(
 ; CHECK-NEXT:    [[B:%.*]] = mul <4 x i8> [[X:%.*]], <i8 42, i8 42, i8 undef, i8 42>
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]]
@@ -158,6 +169,7 @@ define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i3
 }
 
 define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_binop_nonsplat_variable_index(
 ; CHECK-NEXT:    [[B:%.*]] = lshr <4 x i8> [[X:%.*]], <i8 4, i8 3, i8 undef, i8 2>
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]]
@@ -169,6 +181,7 @@ define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) {
 }
 
 define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) {
+;
 ; CHECK-LABEL: @extract_element_load(
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i32 2
@@ -183,6 +196,7 @@ define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) {
 }
 
 define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0, <4 x float>* %ptr1) {
+;
 ; CHECK-LABEL: @extract_element_multi_Use_load(
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR0:%.*]], align 16
 ; CHECK-NEXT:    store <4 x float> [[LOAD]], <4 x float>* [[PTR1:%.*]], align 16
@@ -198,6 +212,7 @@ define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0,
 }
 
 define float @extract_element_variable_index(<4 x float> %x, i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_variable_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], 1.000000e+00
@@ -209,6 +224,7 @@ define float @extract_element_variable_index(<4 x float> %x, i32 %y) {
 }
 
 define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) {
+;
 ; CHECK-LABEL: @extelt_binop_insertelt(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
 ; CHECK-NEXT:    [[E:%.*]] = fmul nnan float [[TMP1]], [[F:%.*]]
@@ -224,6 +240,7 @@ define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) {
 ; FIXME: We should propagate the IR flags including wrapping flags.
 
 define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) {
+;
 ; CHECK-LABEL: @extelt_binop_binop_insertelt(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], [[F:%.*]]
@@ -239,6 +256,7 @@ define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) {
 }
 
 define float @extract_element_constant_vector_variable_index(i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_constant_vector_variable_index(
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, i32 [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[R]]
@@ -248,6 +266,7 @@ define float @extract_element_constant_vector_variable_index(i32 %y) {
 }
 
 define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) {
+;
 ; CHECK-LABEL: @cheap_to_extract_icmp(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
@@ -262,6 +281,7 @@ define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) {
 }
 
 define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) {
+;
 ; CHECK-LABEL: @cheap_to_extract_fcmp(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
@@ -276,6 +296,7 @@ define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) {
 }
 
 define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) {
+;
 ; CHECK-LABEL: @extractelt_vector_icmp_constrhs(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 0
 ; CHECK-NEXT:    [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0
@@ -287,6 +308,7 @@ define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) {
 }
 
 define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) {
+;
 ; CHECK-LABEL: @extractelt_vector_fcmp_constrhs(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 0
 ; CHECK-NEXT:    [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
@@ -298,6 +320,7 @@ define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) {
 }
 
 define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) {
+;
 ; CHECK-LABEL: @extractelt_vector_icmp_constrhs_dynidx(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 [[IDX:%.*]]
 ; CHECK-NEXT:    [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0
@@ -309,6 +332,7 @@ define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) {
 }
 
 define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) {
+;
 ; CHECK-LABEL: @extractelt_vector_fcmp_constrhs_dynidx(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 [[IDX:%.*]]
 ; CHECK-NEXT:    [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
@@ -320,6 +344,7 @@ define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) {
 }
 
 define i1 @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(<2 x float> %arg0, <2 x float> %arg1, <2 x float> %arg2, i32 %idx) {
+;
 ; CHECK-LABEL: @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x float> [[ARG1:%.*]], [[ARG2:%.*]]
 ; CHECK-NEXT:    store volatile <2 x float> [[ADD]], <2 x float>* undef, align 8
diff --git a/llvm/test/Transforms/InstCombine/scalarization.ll b/llvm/test/Transforms/InstCombine/scalarization.ll
index 08cc534b76f8e2..2bbba7966ad185 100644
--- a/llvm/test/Transforms/InstCombine/scalarization.ll
+++ b/llvm/test/Transforms/InstCombine/scalarization.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
 define i32 @extract_load(<4 x i32>* %p) {
+;
 ; CHECK-LABEL: @extract_load(
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x i32> [[X]], i32 1
@@ -13,6 +14,7 @@ define i32 @extract_load(<4 x i32>* %p) {
 }
 
 define double @extract_load_fp(<4 x double>* %p) {
+;
 ; CHECK-LABEL: @extract_load_fp(
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 3
@@ -24,6 +26,7 @@ define double @extract_load_fp(<4 x double>* %p) {
 }
 
 define double @extract_load_volatile(<4 x double>* %p) {
+;
 ; CHECK-LABEL: @extract_load_volatile(
 ; CHECK-NEXT:    [[X:%.*]] = load volatile <4 x double>, <4 x double>* [[P:%.*]], align 32
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 2
@@ -35,6 +38,7 @@ define double @extract_load_volatile(<4 x double>* %p) {
 }
 
 define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) {
+;
 ; CHECK-LABEL: @extract_load_extra_use(
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 8
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 0
@@ -48,6 +52,7 @@ define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) {
 }
 
 define double @extract_load_variable_index(<4 x double>* %p, i32 %y) {
+;
 ; CHECK-LABEL: @extract_load_variable_index(
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32
 ; CHECK-NEXT:    [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 [[Y:%.*]]
@@ -59,6 +64,7 @@ define double @extract_load_variable_index(<4 x double>* %p, i32 %y) {
 }
 
 define void @scalarize_phi(i32 * %n, float * %inout) {
+;
 ; CHECK-LABEL: @scalarize_phi(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[T0:%.*]] = load volatile float, float* [[INOUT:%.*]], align 4
@@ -103,6 +109,7 @@ for.end:
 }
 
 define float @extract_element_binop_splat_constant_index(<4 x float> %x) {
+;
 ; CHECK-LABEL: @extract_element_binop_splat_constant_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
 ; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], 0x4002A3D700000000
@@ -114,6 +121,7 @@ define float @extract_element_binop_splat_constant_index(<4 x float> %x) {
 }
 
 define double @extract_element_binop_splat_with_undef_constant_index(<2 x double> %x) {
+;
 ; CHECK-LABEL: @extract_element_binop_splat_with_undef_constant_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
 ; CHECK-NEXT:    [[R:%.*]] = fdiv double 4.200000e+01, [[TMP1]]
@@ -125,6 +133,7 @@ define double @extract_element_binop_splat_with_undef_constant_index(<2 x double
 }
 
 define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) {
+;
 ; CHECK-LABEL: @extract_element_binop_nonsplat_constant_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
 ; CHECK-NEXT:    [[R:%.*]] = fmul float [[TMP1]], 4.300000e+01
@@ -136,6 +145,7 @@ define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) {
 }
 
 define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_binop_splat_variable_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i8 [[TMP1]], 42
@@ -147,6 +157,7 @@ define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) {
 }
 
 define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_binop_splat_with_undef_variable_index(
 ; CHECK-NEXT:    [[B:%.*]] = mul <4 x i8> [[X:%.*]], <i8 42, i8 42, i8 undef, i8 42>
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]]
@@ -158,6 +169,7 @@ define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i3
 }
 
 define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_binop_nonsplat_variable_index(
 ; CHECK-NEXT:    [[B:%.*]] = lshr <4 x i8> [[X:%.*]], <i8 4, i8 3, i8 undef, i8 2>
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]]
@@ -169,6 +181,7 @@ define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) {
 }
 
 define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) {
+;
 ; CHECK-LABEL: @extract_element_load(
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i32 2
@@ -183,6 +196,7 @@ define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) {
 }
 
 define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0, <4 x float>* %ptr1) {
+;
 ; CHECK-LABEL: @extract_element_multi_Use_load(
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR0:%.*]], align 16
 ; CHECK-NEXT:    store <4 x float> [[LOAD]], <4 x float>* [[PTR1:%.*]], align 16
@@ -198,6 +212,7 @@ define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0,
 }
 
 define float @extract_element_variable_index(<4 x float> %x, i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_variable_index(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = fadd float [[TMP1]], 1.000000e+00
@@ -209,6 +224,7 @@ define float @extract_element_variable_index(<4 x float> %x, i32 %y) {
 }
 
 define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) {
+;
 ; CHECK-LABEL: @extelt_binop_insertelt(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
 ; CHECK-NEXT:    [[E:%.*]] = fmul nnan float [[TMP1]], [[F:%.*]]
@@ -222,6 +238,7 @@ define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) {
 
 ; We recurse to find a scalarizable operand.
 define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) {
+;
 ; CHECK-LABEL: @extelt_binop_binop_insertelt(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], [[F:%.*]]
@@ -237,6 +254,7 @@ define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) {
 }
 
 define float @extract_element_constant_vector_variable_index(i32 %y) {
+;
 ; CHECK-LABEL: @extract_element_constant_vector_variable_index(
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, i32 [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[R]]
@@ -246,6 +264,7 @@ define float @extract_element_constant_vector_variable_index(i32 %y) {
 }
 
 define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) {
+;
 ; CHECK-LABEL: @cheap_to_extract_icmp(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
@@ -260,6 +279,7 @@ define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) {
 }
 
 define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) {
+;
 ; CHECK-LABEL: @cheap_to_extract_fcmp(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
@@ -274,6 +294,7 @@ define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) {
 }
 
 define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) {
+;
 ; CHECK-LABEL: @extractelt_vector_icmp_constrhs(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 0
 ; CHECK-NEXT:    [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0
@@ -285,6 +306,7 @@ define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) {
 }
 
 define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) {
+;
 ; CHECK-LABEL: @extractelt_vector_fcmp_constrhs(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 0
 ; CHECK-NEXT:    [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
@@ -296,6 +318,7 @@ define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) {
 }
 
 define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) {
+;
 ; CHECK-LABEL: @extractelt_vector_icmp_constrhs_dynidx(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 [[IDX:%.*]]
 ; CHECK-NEXT:    [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0
@@ -307,6 +330,7 @@ define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) {
 }
 
 define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) {
+;
 ; CHECK-LABEL: @extractelt_vector_fcmp_constrhs_dynidx(
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 [[IDX:%.*]]
 ; CHECK-NEXT:    [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00
@@ -318,6 +342,7 @@ define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) {
 }
 
 define i1 @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(<2 x float> %arg0, <2 x float> %arg1, <2 x float> %arg2, i32 %idx) {
+;
 ; CHECK-LABEL: @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x float> [[ARG1:%.*]], [[ARG2:%.*]]
 ; CHECK-NEXT:    store volatile <2 x float> [[ADD]], <2 x float>* undef, align 8
diff --git a/llvm/test/Transforms/InstCombine/sincospi.ll b/llvm/test/Transforms/InstCombine/sincospi.ll
index 10342c50096179..51a18bc33e28fa 100644
--- a/llvm/test/Transforms/InstCombine/sincospi.ll
+++ b/llvm/test/Transforms/InstCombine/sincospi.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -instcombine -S < %s -mtriple=x86_64-apple-macosx10.9 | FileCheck %s --check-prefix=CHECK-FLOAT-IN-VEC
 ; RUN: opt -instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s
 ; RUN: opt -instcombine -S < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
@@ -9,93 +10,175 @@
 attributes #0 = { readnone nounwind }
 
 declare float @__sinpif(float %x) #0
-declare float @__cospif(float %x) #0 
+declare float @__cospif(float %x) #0
 
 declare double @__sinpi(double %x) #0
-declare double @__cospi(double %x) #0 
+declare double @__cospi(double %x) #0
 
 @var32 = global float 0.0
 @var64 = global double 0.0
 
 define float @test_instbased_f32() {
-       %val = load float, float* @var32
-       %sin = call float @__sinpif(float %val) #0
-       %cos = call float @__cospif(float %val) #0
-       %res = fadd float %sin, %cos
-       ret float %res
-; CHECK-FLOAT-IN-VEC: [[VAL:%[a-z0-9]+]] = load float, float* @var32
-; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call <2 x float> @__sincospif_stret(float [[VAL]])
-; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 0
-; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 1
-
-; CHECK: [[VAL:%[a-z0-9]+]] = load float, float* @var32
-; CHECK: [[SINCOS:%[a-z0-9]+]] = call { float, float } @__sincospif_stret(float [[VAL]])
-; CHECK: extractvalue { float, float } [[SINCOS]], 0
-; CHECK: extractvalue { float, float } [[SINCOS]], 1
-
-; CHECK-NO-SINCOS: call float @__sinpif
-; CHECK-NO-SINCOS: call float @__cospif
+; CHECK-FLOAT-IN-VEC-LABEL: @test_instbased_f32(
+; CHECK-FLOAT-IN-VEC-NEXT:    [[VAL:%.*]] = load float, float* @var32, align 4
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINCOSPI:%.*]] = call <2 x float> @__sincospif_stret(float [[VAL]])
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i32 0
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COSPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i32 1
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SIN:%.*]] = call float @__sinpif(float [[VAL]]) #[[ATTR0:[0-9]+]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COS:%.*]] = call float @__cospif(float [[VAL]]) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]]
+; CHECK-FLOAT-IN-VEC-NEXT:    ret float [[RES]]
+;
+; CHECK-LABEL: @test_instbased_f32(
+; CHECK-NEXT:    [[VAL:%.*]] = load float, float* @var32, align 4
+; CHECK-NEXT:    [[SINCOSPI:%.*]] = call { float, float } @__sincospif_stret(float [[VAL]])
+; CHECK-NEXT:    [[SINPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 0
+; CHECK-NEXT:    [[COSPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 1
+; CHECK-NEXT:    [[SIN:%.*]] = call float @__sinpif(float [[VAL]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    [[COS:%.*]] = call float @__cospif(float [[VAL]]) #[[ATTR0]]
+; CHECK-NEXT:    [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]]
+; CHECK-NEXT:    ret float [[RES]]
+;
+; CHECK-NO-SINCOS-LABEL: @test_instbased_f32(
+; CHECK-NO-SINCOS-NEXT:    [[VAL:%.*]] = load float, float* @var32, align 4
+; CHECK-NO-SINCOS-NEXT:    [[SIN:%.*]] = call float @__sinpif(float [[VAL]]) #[[ATTR0:[0-9]+]]
+; CHECK-NO-SINCOS-NEXT:    [[COS:%.*]] = call float @__cospif(float [[VAL]]) #[[ATTR0]]
+; CHECK-NO-SINCOS-NEXT:    [[RES:%.*]] = fadd float [[SIN]], [[COS]]
+; CHECK-NO-SINCOS-NEXT:    ret float [[RES]]
+;
+  %val = load float, float* @var32
+  %sin = call float @__sinpif(float %val) #0
+  %cos = call float @__cospif(float %val) #0
+  %res = fadd float %sin, %cos
+  ret float %res
+
+
 }
 
 define float @test_constant_f32() {
-       %sin = call float @__sinpif(float 1.0) #0
-       %cos = call float @__cospif(float 1.0) #0
-       %res = fadd float %sin, %cos
-       ret float %res
-; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call <2 x float> @__sincospif_stret(float 1.000000e+00)
-; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 0
-; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 1
-
-; CHECK: [[SINCOS:%[a-z0-9]+]] = call { float, float } @__sincospif_stret(float 1.000000e+00)
-; CHECK: extractvalue { float, float } [[SINCOS]], 0
-; CHECK: extractvalue { float, float } [[SINCOS]], 1
-
-; CHECK-NO-SINCOS: call float @__sinpif
-; CHECK-NO-SINCOS: call float @__cospif
+; CHECK-FLOAT-IN-VEC-LABEL: @test_constant_f32(
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINCOSPI:%.*]] = call <2 x float> @__sincospif_stret(float 1.000000e+00)
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i32 0
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COSPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i32 1
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SIN:%.*]] = call float @__sinpif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COS:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]]
+; CHECK-FLOAT-IN-VEC-NEXT:    ret float [[RES]]
+;
+; CHECK-LABEL: @test_constant_f32(
+; CHECK-NEXT:    [[SINCOSPI:%.*]] = call { float, float } @__sincospif_stret(float 1.000000e+00)
+; CHECK-NEXT:    [[SINPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 0
+; CHECK-NEXT:    [[COSPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 1
+; CHECK-NEXT:    [[SIN:%.*]] = call float @__sinpif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-NEXT:    [[COS:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-NEXT:    [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]]
+; CHECK-NEXT:    ret float [[RES]]
+;
+; CHECK-NO-SINCOS-LABEL: @test_constant_f32(
+; CHECK-NO-SINCOS-NEXT:    [[SIN:%.*]] = call float @__sinpif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-NO-SINCOS-NEXT:    [[COS:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]]
+; CHECK-NO-SINCOS-NEXT:    [[RES:%.*]] = fadd float [[SIN]], [[COS]]
+; CHECK-NO-SINCOS-NEXT:    ret float [[RES]]
+;
+  %sin = call float @__sinpif(float 1.0) #0
+  %cos = call float @__cospif(float 1.0) #0
+  %res = fadd float %sin, %cos
+  ret float %res
+
+
 }
 
 define double @test_instbased_f64() {
-       %val = load double, double* @var64
-       %sin = call double @__sinpi(double %val) #0
-       %cos = call double @__cospi(double %val) #0
-       %res = fadd double %sin, %cos
-       ret double %res
-; CHECK-FLOAT-IN-VEC: [[VAL:%[a-z0-9]+]] = load double, double* @var64
-; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double [[VAL]])
-; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 0
-; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 1
-
-; CHECK: [[VAL:%[a-z0-9]+]] = load double, double* @var64
-; CHECK: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double [[VAL]])
-; CHECK: extractvalue { double, double } [[SINCOS]], 0
-; CHECK: extractvalue { double, double } [[SINCOS]], 1
-
-; CHECK-NO-SINCOS: call double @__sinpi
-; CHECK-NO-SINCOS: call double @__cospi
+; CHECK-FLOAT-IN-VEC-LABEL: @test_instbased_f64(
+; CHECK-FLOAT-IN-VEC-NEXT:    [[VAL:%.*]] = load double, double* @var64, align 8
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]])
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
+; CHECK-FLOAT-IN-VEC-NEXT:    ret double [[RES]]
+;
+; CHECK-LABEL: @test_instbased_f64(
+; CHECK-NEXT:    [[VAL:%.*]] = load double, double* @var64, align 8
+; CHECK-NEXT:    [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]])
+; CHECK-NEXT:    [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0
+; CHECK-NEXT:    [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1
+; CHECK-NEXT:    [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]]
+; CHECK-NEXT:    [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]]
+; CHECK-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
+; CHECK-NEXT:    ret double [[RES]]
+;
+; CHECK-NO-SINCOS-LABEL: @test_instbased_f64(
+; CHECK-NO-SINCOS-NEXT:    [[VAL:%.*]] = load double, double* @var64, align 8
+; CHECK-NO-SINCOS-NEXT:    [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]]
+; CHECK-NO-SINCOS-NEXT:    [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]]
+; CHECK-NO-SINCOS-NEXT:    [[RES:%.*]] = fadd double [[SIN]], [[COS]]
+; CHECK-NO-SINCOS-NEXT:    ret double [[RES]]
+;
+  %val = load double, double* @var64
+  %sin = call double @__sinpi(double %val) #0
+  %cos = call double @__cospi(double %val) #0
+  %res = fadd double %sin, %cos
+  ret double %res
+
+
 }
 
 define double @test_constant_f64() {
-       %sin = call double @__sinpi(double 1.0) #0
-       %cos = call double @__cospi(double 1.0) #0
-       %res = fadd double %sin, %cos
-       ret double %res
-; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double 1.000000e+00)
-; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 0
-; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 1
-
-; CHECK: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double 1.000000e+00)
-; CHECK: extractvalue { double, double } [[SINCOS]], 0
-; CHECK: extractvalue { double, double } [[SINCOS]], 1
-
-; CHECK-NO-SINCOS: call double @__sinpi
-; CHECK-NO-SINCOS: call double @__cospi
+; CHECK-FLOAT-IN-VEC-LABEL: @test_constant_f64(
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double 1.000000e+00)
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SIN:%.*]] = call double @__sinpi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COS:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
+; CHECK-FLOAT-IN-VEC-NEXT:    ret double [[RES]]
+;
+; CHECK-LABEL: @test_constant_f64(
+; CHECK-NEXT:    [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double 1.000000e+00)
+; CHECK-NEXT:    [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0
+; CHECK-NEXT:    [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1
+; CHECK-NEXT:    [[SIN:%.*]] = call double @__sinpi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-NEXT:    [[COS:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
+; CHECK-NEXT:    ret double [[RES]]
+;
+; CHECK-NO-SINCOS-LABEL: @test_constant_f64(
+; CHECK-NO-SINCOS-NEXT:    [[SIN:%.*]] = call double @__sinpi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-NO-SINCOS-NEXT:    [[COS:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]]
+; CHECK-NO-SINCOS-NEXT:    [[RES:%.*]] = fadd double [[SIN]], [[COS]]
+; CHECK-NO-SINCOS-NEXT:    ret double [[RES]]
+;
+  %sin = call double @__sinpi(double 1.0) #0
+  %cos = call double @__cospi(double 1.0) #0
+  %res = fadd double %sin, %cos
+  ret double %res
+
+
 }
 
 define double @test_fptr(double (double)* %fptr, double %p1) {
-       %sin = call double @__sinpi(double %p1) #0
-       %cos = call double %fptr(double %p1)
-       %res = fadd double %sin, %cos
-       ret double %res
-; CHECK-LABEL: @test_fptr
-; CHECK: __sinpi
+; CHECK-FLOAT-IN-VEC-LABEL: @test_fptr(
+; CHECK-FLOAT-IN-VEC-NEXT:    [[SIN:%.*]] = call double @__sinpi(double [[P1:%.*]]) #[[ATTR0]]
+; CHECK-FLOAT-IN-VEC-NEXT:    [[COS:%.*]] = call double [[FPTR:%.*]](double [[P1]])
+; CHECK-FLOAT-IN-VEC-NEXT:    [[RES:%.*]] = fadd double [[SIN]], [[COS]]
+; CHECK-FLOAT-IN-VEC-NEXT:    ret double [[RES]]
+;
+; CHECK-LABEL: @test_fptr(
+; CHECK-NEXT:    [[SIN:%.*]] = call double @__sinpi(double [[P1:%.*]]) #[[ATTR0]]
+; CHECK-NEXT:    [[COS:%.*]] = call double [[FPTR:%.*]](double [[P1]])
+; CHECK-NEXT:    [[RES:%.*]] = fadd double [[SIN]], [[COS]]
+; CHECK-NEXT:    ret double [[RES]]
+;
+; CHECK-NO-SINCOS-LABEL: @test_fptr(
+; CHECK-NO-SINCOS-NEXT:    [[SIN:%.*]] = call double @__sinpi(double [[P1:%.*]]) #[[ATTR0]]
+; CHECK-NO-SINCOS-NEXT:    [[COS:%.*]] = call double [[FPTR:%.*]](double [[P1]])
+; CHECK-NO-SINCOS-NEXT:    [[RES:%.*]] = fadd double [[SIN]], [[COS]]
+; CHECK-NO-SINCOS-NEXT:    ret double [[RES]]
+;
+  %sin = call double @__sinpi(double %p1) #0
+  %cos = call double %fptr(double %p1)
+  %res = fadd double %sin, %cos
+  ret double %res
 }
diff --git a/llvm/test/Transforms/InstCombine/vec_extract_2elts.ll b/llvm/test/Transforms/InstCombine/vec_extract_2elts.ll
index 5972340d60a91d..f359dfe00a3b8a 100644
--- a/llvm/test/Transforms/InstCombine/vec_extract_2elts.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_2elts.ll
@@ -1,12 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 define void @test(<4 x i32> %v, i64 *%r1, i64 *%r2) {
-;CHECK: %1 = extractelement <4 x i32> %v, i32 0
-;CHECK: %2 = zext i32 %1 to i64
-        %1 = zext <4 x i32> %v to <4 x i64>
-        %2 = extractelement <4 x i64> %1, i32 0
-        store i64 %2, i64 *%r1
-        store i64 %2, i64 *%r2
-        ret void
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[V:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    store i64 [[TMP2]], i64* [[R1:%.*]], align 4
+; CHECK-NEXT:    store i64 [[TMP2]], i64* [[R2:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = zext <4 x i32> %v to <4 x i64>
+  %2 = extractelement <4 x i64> %1, i32 0
+  store i64 %2, i64 *%r1
+  store i64 %2, i64 *%r2
+  ret void
 }
 
diff --git a/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
index e8c96d15bdb167..a792c6156240d3 100644
--- a/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
@@ -1,9 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 define void @f(i64 %val, i32  %limit, i32 *%ptr) {
-; CHECK-LABEL: @f
-; CHECK: %0 = trunc i64 %val to i32
-; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ]
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[END:%.*]] = icmp ult i32 [[TMP1]], [[LIMIT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 10
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5]] = add i32 [[TMP1]], 16
+; CHECK-NEXT:    br i1 [[END]], label [[LOOP]], label [[RET:%.*]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret void
+;
 entry:
   %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0
   %vector = shufflevector <16 x i64> %tempvector, <16 x i64> poison, <16 x i32> zeroinitializer
@@ -27,9 +41,22 @@ ret:
 }
 
 define void @copy(i64 %val, i32  %limit, i32 *%ptr) {
-; CHECK-LABEL: @copy
-; CHECK: %0 = trunc i64 %val to i32
-; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ]
+; CHECK-LABEL: @copy(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[END:%.*]] = icmp ult i32 [[TMP1]], [[LIMIT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 10
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5]] = add i32 [[TMP1]], 16
+; CHECK-NEXT:    br i1 [[END]], label [[LOOP]], label [[RET:%.*]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret void
+;
 entry:
   %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0
   %vector = shufflevector <16 x i64> %tempvector, <16 x i64> poison, <16 x i32> zeroinitializer
@@ -54,9 +81,27 @@ ret:
 }
 
 define void @nocopy(i64 %val, i32  %limit, i32 *%ptr) {
-; CHECK-LABEL: @nocopy
-; CHECK-NOT: phi i32
-; CHECK: phi <16 x i32> [ %3, %entry ], [ %inc, %loop ]
+; CHECK-LABEL: @nocopy(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <16 x i32> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ELT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[ELTCOPY:%.*]] = extractelement <16 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[END:%.*]] = icmp ult i32 [[ELT]], [[LIMIT:%.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[ELTCOPY]], 10
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[ELT]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[INC]] = add <16 x i32> [[TMP4]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    br i1 [[END]], label [[LOOP]], label [[RET:%.*]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret void
+;
 entry:
   %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0
   %vector = shufflevector <16 x i64> %tempvector, <16 x i64> poison, <16 x i32> zeroinitializer
@@ -81,8 +126,23 @@ ret:
 }
 
 define i1 @g(<3 x i32> %input_2) {
-; CHECK-LABEL: @g
-; CHECK: extractelement <3 x i32> %input_2, i32 0
+; CHECK-LABEL: @g(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <3 x i32> [[INPUT_2:%.*]], i32 0
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP2]], -1
+; CHECK-NEXT:    [[SUB44_ELT:%.*]] = sub i32 0, [[TMP2]]
+; CHECK-NEXT:    [[TMP4]] = sdiv i32 [[TMP1]], [[SUB44_ELT]]
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[TOBOOL313:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[TOBOOL313]]
+;
 entry:
   br label %for.cond
 
@@ -91,7 +151,6 @@ for.cond:
   %input_1.addr.1 = phi <3 x i32> [ undef, %entry ], [ %dec43, %for.body ]
   br i1 undef, label %for.end, label %for.body
 
-; CHECK-NOT: extractelement <3 x i32> %{{.*}}, i32 0
 for.body:
   %dec43 = add <3 x i32> %input_1.addr.1, <i32 -1, i32 -1, i32 -1>
   %sub44 = sub <3 x i32> <i32 -1, i32 -1, i32 -1>, %dec43
diff --git a/llvm/test/Transforms/InstCombine/vec_phi_extract.ll b/llvm/test/Transforms/InstCombine/vec_phi_extract.ll
index 15eb94aad6943a..3aacc0f8203526 100644
--- a/llvm/test/Transforms/InstCombine/vec_phi_extract.ll
+++ b/llvm/test/Transforms/InstCombine/vec_phi_extract.ll
@@ -1,9 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 define void @f(i64 %val, i32  %limit, i32 *%ptr) {
-; CHECK-LABEL: @f
-; CHECK: %0 = trunc i64 %val to i32
-; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ]
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[END:%.*]] = icmp ult i32 [[TMP1]], [[LIMIT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 10
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5]] = add i32 [[TMP1]], 16
+; CHECK-NEXT:    br i1 [[END]], label [[LOOP]], label [[RET:%.*]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret void
+;
 entry:
   %tempvector = insertelement <16 x i64> undef, i64 %val, i32 0
   %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
@@ -27,9 +41,22 @@ ret:
 }
 
 define void @copy(i64 %val, i32  %limit, i32 *%ptr) {
-; CHECK-LABEL: @copy
-; CHECK: %0 = trunc i64 %val to i32
-; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ]
+; CHECK-LABEL: @copy(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[END:%.*]] = icmp ult i32 [[TMP1]], [[LIMIT:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 10
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5]] = add i32 [[TMP1]], 16
+; CHECK-NEXT:    br i1 [[END]], label [[LOOP]], label [[RET:%.*]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret void
+;
 entry:
   %tempvector = insertelement <16 x i64> undef, i64 %val, i32 0
   %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
@@ -54,9 +81,27 @@ ret:
 }
 
 define void @nocopy(i64 %val, i32  %limit, i32 *%ptr) {
-; CHECK-LABEL: @nocopy
-; CHECK-NOT: phi i32
-; CHECK: phi <16 x i32> [ %3, %entry ], [ %inc, %loop ]
+; CHECK-LABEL: @nocopy(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <16 x i32> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ELT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[ELTCOPY:%.*]] = extractelement <16 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[END:%.*]] = icmp ult i32 [[ELT]], [[LIMIT:%.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[ELTCOPY]], 10
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[ELT]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[INC]] = add <16 x i32> [[TMP4]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    br i1 [[END]], label [[LOOP]], label [[RET:%.*]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret void
+;
 entry:
   %tempvector = insertelement <16 x i64> undef, i64 %val, i32 0
   %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
@@ -81,8 +126,23 @@ ret:
 }
 
 define i1 @g(<3 x i32> %input_2) {
-; CHECK-LABEL: @g
-; CHECK: extractelement <3 x i32> %input_2, i32 0
+; CHECK-LABEL: @g(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <3 x i32> [[INPUT_2:%.*]], i32 0
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP2]], -1
+; CHECK-NEXT:    [[SUB44_ELT:%.*]] = sub i32 0, [[TMP2]]
+; CHECK-NEXT:    [[TMP4]] = sdiv i32 [[TMP1]], [[SUB44_ELT]]
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[TOBOOL313:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    ret i1 [[TOBOOL313]]
+;
 entry:
   br label %for.cond
 
@@ -91,7 +151,6 @@ for.cond:
   %input_1.addr.1 = phi <3 x i32> [ undef, %entry ], [ %dec43, %for.body ]
   br i1 undef, label %for.end, label %for.body
 
-; CHECK-NOT: extractelement <3 x i32> %{{.*}}, i32 0
 for.body:
   %dec43 = add <3 x i32> %input_1.addr.1, <i32 -1, i32 -1, i32 -1>
   %sub44 = sub <3 x i32> <i32 -1, i32 -1, i32 -1>, %dec43
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 6c82f9dc4c6381..90f249fa18eab0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
 ; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
@@ -8,72 +9,79 @@ target triple = "aarch64--linux-gnu"
 ; This test checks that we correctly compute the scalarized operands for a
 ; user-specified vectorization factor when interleaving is disabled. We use the
 ; "optsize" attribute to disable all interleaving calculations.  A cost of 4
-; for %tmp4 indicates that we would scalarize it's operand (%tmp3), giving
-; %tmp4 a lower scalarization overhead.
+; for %var4 indicates that we would scalarize it's operand (%var3), giving
+; %var4 a lower scalarization overhead.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+; COST:        LV: Found an estimated cost of 4 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
 ;
+;
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize {
 ; CHECK-LABEL: @predicated_udiv_scalarized_operand(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
-; CHECK:       [[PRED_UDIV_IF]]:
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; CHECK:       pred.udiv.if:
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], %x
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], [[X:%.*]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
-; CHECK-NEXT:    br label %[[PRED_UDIV_CONTINUE]]
-; CHECK:       [[PRED_UDIV_CONTINUE]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi <2 x i64> [ poison, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ]
+; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE]]
+; CHECK:       pred.udiv.continue:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <2 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
-; CHECK:       [[PRED_UDIV_IF1]]:
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
+; CHECK:       pred.udiv.if1:
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i64 [[TMP11]], %x
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i64 [[TMP11]], [[X]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = udiv i64 [[TMP13]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
-; CHECK-NEXT:    br label %[[PRED_UDIV_CONTINUE2]]
-; CHECK:       [[PRED_UDIV_CONTINUE2]]:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ]
+; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
+; CHECK:       pred.udiv.continue2:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP17]])
+; CHECK-NEXT:    ret i64 [[TMP19]]
 ;
-define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize {
 entry:
   br label %for.body
 
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
-  %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
-  %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
-  %tmp2 = load i64, i64* %tmp0, align 4
-  %cond0 = icmp sgt i64 %tmp2, 0
+  %r = phi i64 [ 0, %entry ], [ %var6, %for.inc ]
+  %var0 = getelementptr inbounds i64, i64* %a, i64 %i
+  %var2 = load i64, i64* %var0, align 4
+  %cond0 = icmp sgt i64 %var2, 0
   br i1 %cond0, label %if.then, label %for.inc
 
 if.then:
-  %tmp3 = add nsw i64 %tmp2, %x
-  %tmp4 = udiv i64 %tmp2, %tmp3
+  %var3 = add nsw i64 %var2, %x
+  %var4 = udiv i64 %var2, %var3
   br label %for.inc
 
 for.inc:
-  %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
-  %tmp6 = add i64 %r, %tmp5
+  %var5 = phi i64 [ %var2, %for.body ], [ %var4, %if.then]
+  %var6 = add i64 %r, %var5
   %i.next = add nuw nsw i64 %i, 1
   %cond1 = icmp slt i64 %i.next, 100
   br i1 %cond1, label %for.body, label %for.end
 
 for.end:
-  %tmp7 = phi i64 [ %tmp6, %for.inc ]
-  ret i64 %tmp7
+  %var7 = phi i64 [ %var6, %for.inc ]
+  ret i64 %var7
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
index 98acd07df33849..a2912dc58c1dd5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
@@ -1,18 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -scalable-vectorization=on -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -o - | FileCheck %s
 
 define void @cond_inv_load_i32i32i16(i32* noalias nocapture %a, i32* noalias nocapture readonly %cond, i16* noalias nocapture readonly %inv, i64 %n) #0 {
-; CHECK-LABEL: @cond_inv_load_i32i32i16
-; CHECK:     vector.ph:
-; CHECK:       %[[INVINS:.*]] = insertelement <vscale x 4 x i16*> poison, i16* %inv, i32 0
-; CHECK:       %[[INVSPLAT:.*]] = shufflevector <vscale x 4 x i16*> %[[INVINS]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK:     vector.body:
-; CHECK:       %[[GEPCOND:.*]] = getelementptr inbounds i32, i32* %cond, i64 %index
-; CHECK-NEXT:  %[[GEPCOND2:.*]] = bitcast i32* %[[GEPCOND]] to <vscale x 4 x i32>*
-; CHECK-NEXT:  %[[CONDVALS:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* %[[GEPCOND2]], align 4
-; CHECK-NEXT:  %[[MASK:.*]] = icmp ne <vscale x 4 x i32> %[[CONDVALS]],
-; CHECK-NEXT:  %[[GATHERLOAD:.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> %[[INVSPLAT]], i32 2, <vscale x 4 x i1> %[[MASK]], <vscale x 4 x i16> undef)
-; CHECK-NEXT:  %[[GATHERLOAD2:.*]] = sext <vscale x 4 x i16> %[[GATHERLOAD]] to <vscale x 4 x i32>
-; CHECK:       call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[GATHERLOAD2]]
+; CHECK-LABEL: @cond_inv_load_i32i32i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16*> poison, i16* [[INV:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i16> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32>* [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[I_07]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, i16* [[INV]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_07]]
+; CHECK-NEXT:    store i32 [[CONV]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_07]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -40,17 +84,59 @@ exit:                        ; preds = %for.inc
 }
 
 define void @cond_inv_load_f64f64f64(double* noalias nocapture %a, double* noalias nocapture readonly %cond, double* noalias nocapture readonly %inv, i64 %n) #0 {
-; CHECK-LABEL: @cond_inv_load_f64f64f64
-; CHECK:     vector.ph:
-; CHECK:       %[[INVINS:.*]] = insertelement <vscale x 4 x double*> poison, double* %inv, i32 0
-; CHECK:       %[[INVSPLAT:.*]] = shufflevector <vscale x 4 x double*> %[[INVINS]], <vscale x 4 x double*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK:     vector.body:
-; CHECK:       %[[GEPCOND:.*]] = getelementptr inbounds double, double* %cond, i64 %index
-; CHECK-NEXT:  %[[GEPCOND2:.*]] = bitcast double* %[[GEPCOND]] to <vscale x 4 x double>*
-; CHECK-NEXT:  %[[CONDVALS:.*]] = load <vscale x 4 x double>, <vscale x 4 x double>* %[[GEPCOND2]], align 8
-; CHECK-NEXT:  %[[MASK:.*]] = fcmp ogt <vscale x 4 x double> %[[CONDVALS]],
-; CHECK-NEXT:  %[[GATHERLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> %[[INVSPLAT]], i32 8, <vscale x 4 x i1> %[[MASK]], <vscale x 4 x double> undef)
-; CHECK:       call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> %[[GATHERLOAD]]
+; CHECK-LABEL: @cond_inv_load_f64f64f64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x double*> poison, double* [[INV:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x double*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x double*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[COND:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP4]] to <vscale x 4 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, <vscale x 4 x double>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <vscale x 4 x double> [[WIDE_LOAD]], shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 4.000000e-01, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x double> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to <vscale x 4 x double>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> [[WIDE_MASKED_GATHER]], <vscale x 4 x double>* [[TMP8]], i32 8, <vscale x 4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ogt double [[TMP12]], 4.000000e-01
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP13:%.*]] = load double, double* [[INV]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]]
+; CHECK-NEXT:    store double [[TMP13]], double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -77,17 +163,68 @@ exit:                        ; preds = %for.inc
 }
 
 define void @invariant_load_cond(i32* noalias nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %cond, i64 %n) #0 {
-; CHECK-LABEL: @invariant_load_cond
-; CHECK: vector.body
-; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42
-; CHECK-NEXT: %[[SPLATINS:.*]] = insertelement <vscale x 4 x i32*> poison, i32* %[[GEP]], i32 0
-; CHECK-NEXT: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32*> %[[SPLATINS]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK: %[[LOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
-; CHECK-NEXT: %[[ICMP:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD]], zeroinitializer
-; CHECK: %[[MASKED_LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[BITCAST:.*]], i32 4, <vscale x 4 x i1> %[[ICMP]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: %[[MASKED_GATHER:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %[[SPLAT]], i32 4, <vscale x 4 x i1> %[[ICMP]], <vscale x 4 x i32> undef)
-; CHECK-NEXT: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[MASKED_GATHER]], %[[MASKED_LOAD]]
-; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>* %[[BITCAST1:.*]], i32 4, <vscale x 4 x i1> %[[ICMP]])
+; CHECK-LABEL: @invariant_load_cond(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 42
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[DOTSPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[DOTSPLAT]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP7]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP16]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 42
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
index e5bbcd881163fb..70b665d82765ba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
@@ -1,12 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -scalable-vectorization=preferred -force-target-instruction-cost=1 -o - | FileCheck %s
 
 define void @gather_nxv4i32_ind64(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 {
-; CHECK-LABEL: @gather_nxv4i32_ind64
-; CHECK: vector.body:
-; CHECK:   %[[IND:.*]] = load <vscale x 4 x i64>, <vscale x 4 x i64>*
-; CHECK:   %[[PTRS:.*]] = getelementptr inbounds float, float* %a, <vscale x 4 x i64> %[[IND]]
-; CHECK:   %[[GLOAD:.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[PTRS]]
-; CHECK:   store <vscale x 4 x float> %[[GLOAD]], <vscale x 4 x float>*
+; CHECK-LABEL: @gather_nxv4i32_ind64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <vscale x 4 x i64>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i64>, <vscale x 4 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], <vscale x 4 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP6]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
+; CHECK-NEXT:    store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], <vscale x 4 x float>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP13]], float* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -30,13 +72,55 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 ; additional 'sext' in the for.body loop exposes additional code paths
 ; during vectorisation.
 define void @scatter_nxv4i32_ind32(float* noalias nocapture %a, i32* noalias nocapture readonly %b, float* noalias nocapture readonly %c, i64 %n) #0 {
-; CHECK-LABEL: @scatter_nxv4i32_ind32
-; CHECK: vector.body:
-; CHECK:   %[[VALS:.*]] = load <vscale x 4 x float>
-; CHECK:   %[[IND:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* %7, align 4
-; CHECK:   %[[EXTIND:.*]] = sext <vscale x 4 x i32> %[[IND]] to <vscale x 4 x i64>
-; CHECK:   %[[PTRS:.*]] = getelementptr inbounds float, float* %a, <vscale x 4 x i64> %[[EXTIND]]
-; CHECK:   call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> %[[VALS]], <vscale x 4 x float*> %[[PTRS]]
+; CHECK-LABEL: @scatter_nxv4i32_ind32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD1]] to <vscale x 4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[A:%.*]], <vscale x 4 x i64> [[TMP8]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float*> [[TMP9]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = sext i32 [[TMP14]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDXPROM4]]
+; CHECK-NEXT:    store float [[TMP13]], float* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -58,14 +142,54 @@ for.cond.cleanup:                                 ; preds = %for.body, %entry
 }
 
 define void @scatter_inv_nxv4i32(i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-LABEL: @scatter_inv_nxv4i32
-; CHECK: vector.ph:
-; CHECK:   %[[INS:.*]] = insertelement <vscale x 4 x i32*> poison, i32* %inv, i32 0
-; CHECK:   %[[PTRSPLAT:.*]] = shufflevector <vscale x 4 x i32*> %[[INS]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK: vector.body:
-; CHECK:   %[[VALS:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* %5, align 4
-; CHECK:   %[[MASK:.*]] = icmp ne <vscale x 4 x i32> %[[VALS]],
-; CHECK:   call void @llvm.masked.scatter.nxv4i32.nxv4p0i32({{.*}}, <vscale x 4 x i32*> %[[PTRSPLAT]], i32 4, <vscale x 4 x i1> %[[MASK]])
+; CHECK-LABEL: @scatter_inv_nxv4i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[INV:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 3, i32* [[INV]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -90,14 +214,57 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 }
 
 define void @gather_inv_nxv4i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %inv, i64 %n) #0 {
-; CHECK-LABEL: @gather_inv_nxv4i32
-; CHECK: vector.ph:
-; CHECK:   %[[INS:.*]] = insertelement <vscale x 4 x i32*> poison, i32* %inv, i32 0
-; CHECK:   %[[PTRSPLAT:.*]] = shufflevector <vscale x 4 x i32*> %[[INS]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK: vector.body:
-; CHECK:   %[[VALS:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* %5, align 4
-; CHECK:   %[[MASK:.*]] = icmp sgt <vscale x 4 x i32> %[[VALS]],
-; CHECK:   %{{.*}} = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %[[PTRSPLAT]], i32 4, <vscale x 4 x i1> %[[MASK]]
+; CHECK-LABEL: @gather_inv_nxv4i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[INV:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> undef)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32>* [[TMP7]], i32 4, <vscale x 4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[TMP11]], 3
+; CHECK-NEXT:    br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[INV]], align 4
+; CHECK-NEXT:    store i32 [[TMP12]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -125,16 +292,69 @@ for.cond.cleanup:                                 ; preds = %for.inc, %entry
 
 
 define void @gather_nxv4i32_ind64_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-LABEL: @gather_nxv4i32_ind64_stride2
-; CHECK: vector.body:
-; CHECK:      %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
-; CHECK-DAG:  %[[STEP:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-DAG:  %[[IDXSPLATINS:.*]] = insertelement <vscale x 4 x i64> poison, i64 %[[IDX]], i32 0
-; CHECK-DAG:  %[[IDXSPLAT:.*]] = shufflevector <vscale x 4 x i64> %[[IDXSPLATINS]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK:      %[[ADD:.*]] = add <vscale x 4 x i64> %[[IDXSPLAT]], %[[STEP]]
-; CHECK:      %[[MUL:.*]] = shl <vscale x 4 x i64> %[[ADD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK:      %[[PTRS:.*]] = getelementptr inbounds float, float* %b, <vscale x 4 x i64> %[[MUL]]
-; CHECK:      call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[PTRS]]
+; CHECK-LABEL: @gather_nxv4i32_ind64_stride2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
+; CHECK-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT4]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i64> [[TMP5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = shl <vscale x 4 x i64> [[TMP9]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[B:%.*]], <vscale x 4 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[B]], <vscale x 4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
+; CHECK-NEXT:    store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], <vscale x 4 x float>* [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 2
+; CHECK-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <vscale x 4 x float>*
+; CHECK-NEXT:    store <vscale x 4 x float> [[WIDE_MASKED_GATHER6]], <vscale x 4 x float>* [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw i64 [[TMP21]], 3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INDVARS_IV_STRIDE2:%.*]] = shl i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_STRIDE2]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[TMP24]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
index e5af52c5784781..19c8b1525bc91d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -scalable-vectorization=on -force-target-instruction-cost=1 -dce -instcombine < %s -S | FileCheck %s
 
 target triple = "aarch64-linux-gnu"
@@ -11,15 +12,59 @@ target triple = "aarch64-linux-gnu"
 
 define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
 ; CHECK-LABEL: @cond_ind64(
-; CHECK:    vector.body:
-; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
-; CHECK:      %[[STEPVEC:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 4 x i64> poison, i64 %[[INDEX]], i32 0
-; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector <vscale x 4 x i64> %[[TMP1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: %[[VECIND:.*]] = add <vscale x 4 x i64> %[[IDXSPLT]], %[[STEPVEC]]
-; CHECK-NEXT: %[[MASK:.*]] = trunc <vscale x 4 x i64> %[[VECIND]] to <vscale x 4 x i1>
-; CHECK:      %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]], <vscale x 4 x i32> poison)
-; CHECK:      call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[LOAD]], <vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]])
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc <vscale x 4 x i64> [[TMP5]] to <vscale x 4 x i1>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP12:%.*]] = shl i64 [[TMP11]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[I_08]], 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[AND]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_08]]
+; CHECK-NEXT:    store i32 [[TMP14]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index a11ac0e87b6284..3aca292ebd3c73 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -scalable-vectorization=on -dce -instcombine -S < %s | FileCheck %s
 
 ; Ensure that we can vectorize loops such as:
@@ -14,9 +15,23 @@
 
 define void @widen_ptr_phi_unrolled(i32* noalias nocapture %a, i32* noalias nocapture %b, i32* nocapture readonly %c, i64 %n) #0 {
 ; CHECK-LABEL: @widen_ptr_phi_unrolled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i32* [ %c, %vector.ph ], [ %[[PTR_IND:.*]], %vector.body ]
-; CHECK:         [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i32* [ [[C]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP5]], 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
@@ -30,11 +45,64 @@ define void @widen_ptr_phi_unrolled(i32* noalias nocapture %a, i32* noalias noca
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, <vscale x 4 x i32*> [[TMP9]], i64 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, <vscale x 4 x i32*> [[TMP12]], i64 1
-; CHECK-NEXT:    {{%.*}} = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP9]],
-; CHECK-NEXT:    {{%.*}} = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP12]],
-; CHECK-NEXT:    {{%.*}} = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP13]],
-; CHECK-NEXT:    {{%.*}} = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP14]],
-; CHECK:         [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP7]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP9]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP14]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> undef)
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP15]], <vscale x 4 x i32>* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP20:%.*]] = shl nuw nsw i32 [[TMP19]], 2
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32>* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i32>* [[TMP27]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP29:%.*]] = shl nuw nsw i32 [[TMP28]], 2
+; CHECK-NEXT:    [[TMP30:%.*]] = zext i32 [[TMP29]] to i64
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP26]], i64 [[TMP30]]
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP25]], <vscale x 4 x i32>* [[TMP32]], align 4
+; CHECK-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP34:%.*]] = shl nuw nsw i64 [[TMP33]], 3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[C]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[PTR_014:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[PTR_014]], i64 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i32, i32* [[PTR_014]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* [[PTR_014]], i64 2
+; CHECK-NEXT:    [[TMP37:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP36]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_013]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[TMP37]], 1
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_013]]
+; CHECK-NEXT:    store i32 [[ADD2]], i32* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_013]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -73,27 +141,69 @@ for.exit:                                 ; preds = %for.body
 
 define void @widen_2ptrs_phi_unrolled(i32* noalias nocapture %dst, i32* noalias nocapture readonly %src, i64 %n) #0 {
 ; CHECK-LABEL: @widen_2ptrs_phi_unrolled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
-; CHECK-NEXT:    %[[LGEP1:.*]] = getelementptr i32, i32* %src, i64 %[[IDX]]
-; CHECK-NEXT:    %[[SGEP1:.*]] = getelementptr i32, i32* %dst, i64 %[[IDX]]
-; CHECK-NEXT:    %[[LPTR1:.*]] = bitcast i32* %[[LGEP1]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    %{{.*}} = load <vscale x 4 x i32>, <vscale x 4 x i32>* %[[LPTR1]], align 4
-; CHECK-NEXT:    %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    %[[TMP1:.*]] = shl nuw nsw i32 %[[VSCALE1]], 2
-; CHECK-NEXT:    %[[TMP2:.*]] = zext i32 %[[TMP1]] to i64
-; CHECK-NEXT:    %[[LGEP2:.*]] = getelementptr i32, i32* %[[LGEP1]], i64 %[[TMP2]]
-; CHECK-NEXT:    %[[LPTR2:.*]] = bitcast i32* %[[LGEP2]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    %{{.*}} = load <vscale x 4 x i32>, <vscale x 4 x i32>* %[[LPTR2]], align 4
-; CHECK:         %[[SPTR1:.*]] = bitcast i32* %[[SGEP1]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    store <vscale x 4 x i32> %{{.*}}, <vscale x 4 x i32>* %[[SPTR1]], align 4
-; CHECK-NEXT:    %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    %[[TMP3:.*]] = shl nuw nsw i32 %[[VSCALE2]], 2
-; CHECK-NEXT:    %[[TMP4:.*]] = zext i32 %[[TMP3]] to i64
-; CHECK-NEXT:    %[[SGEP2:.*]] = getelementptr i32, i32* %[[SGEP1]], i64 %[[TMP4]]
-; CHECK-NEXT:    %[[SPTR2:.*]] = bitcast i32* %[[SGEP2]] to <vscale x 4 x i32>*
-; CHECK-NEXT:    store <vscale x 4 x i32> %{{.*}}, <vscale x 4 x i32>* %[[SPTR2]], align 4
-
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[NEXT_GEP5]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP14:%.*]] = shl nuw nsw i32 [[TMP13]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i32, i32* [[NEXT_GEP5]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32>* [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32* [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[S_010:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[D_09:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[S_010]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP21]], 1
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[D_09]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[D_09]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* [[S_010]], i64 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_011]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -124,15 +234,29 @@ for.cond.cleanup:                                 ; preds = %for.body
 
 define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 {
 ; CHECK-LABEL: @pointer_iv_mixed(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i32*, i32** [[B:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i32* [ %a, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ insertelement (<vscale x 2 x i32> zeroinitializer, i32 0, i32 0), %vector.ph ], [ [[TMP9:%.*]], %vector.body ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ insertelement (<vscale x 2 x i32> zeroinitializer, i32 0, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <vscale x 2 x i64> [[TMP6]]
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32*, i32** %b, i64 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32*, i32** [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <vscale x 2 x i32*> [[TMP7]] to <vscale x 2 x <vscale x 2 x i32>*>
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <vscale x 2 x <vscale x 2 x i32>*> [[BC]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, <vscale x 2 x i32>* [[TMP8]], align 8
@@ -142,36 +266,72 @@ define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP5]]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[TMP9]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i32** [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[P:%.*]] = phi i32* [ [[VAR3:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[Q:%.*]] = phi i32** [ [[VAR4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[VAR0:%.*]] = phi i32 [ [[VAR2:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[VAR1:%.*]] = load i32, i32* [[P]], align 8
+; CHECK-NEXT:    [[VAR2]] = add i32 [[VAR1]], [[VAR0]]
+; CHECK-NEXT:    store i32* [[P]], i32** [[Q]], align 8
+; CHECK-NEXT:    [[VAR3]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[VAR4]] = getelementptr inbounds i32*, i32** [[Q]], i64 1
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[VAR5:%.*]] = phi i32 [ [[VAR2]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[VAR5]]
+;
 entry:
   br label %for.body
 
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
-  %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
-  %tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ]
-  %tmp1 = load i32, i32* %p, align 8
-  %tmp2 = add i32 %tmp1, %tmp0
+  %p = phi i32* [ %var3, %for.body ], [ %a, %entry ]
+  %q = phi i32** [ %var4, %for.body ], [ %b, %entry ]
+  %var0 = phi i32 [ %var2, %for.body ], [ 0, %entry ]
+  %var1 = load i32, i32* %p, align 8
+  %var2 = add i32 %var1, %var0
   store i32* %p, i32** %q, align 8
-  %tmp3 = getelementptr inbounds i32, i32* %p, i32 1
-  %tmp4 = getelementptr inbounds i32*, i32** %q, i32 1
+  %var3 = getelementptr inbounds i32, i32* %p, i32 1
+  %var4 = getelementptr inbounds i32*, i32** %q, i32 1
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end, !llvm.loop !6
 
 for.end:
-  %tmp5 = phi i32 [ %tmp2, %for.body ]
-  ret i32 %tmp5
+  %var5 = phi i32 [ %var2, %for.body ]
+  ret i32 %var5
 }
 
 define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(i16* %ptr) #0 {
 ; CHECK-LABEL: @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i16, i16* [[PTR:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i16* [ %ptr, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i16* [ [[PTR]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
@@ -183,9 +343,32 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(i16*
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}}
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i16, i16* [[POINTER_PHI]], i64 [[TMP3]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_PTR:%.*]] = phi i16* [ [[INCDEC_IV_PTR:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[CMP_I_NOT:%.*]] = icmp eq i16* [[IV_PTR]], null
+; CHECK-NEXT:    br i1 [[CMP_I_NOT]], label [[IF_END]], label [[IF_END_SINK_SPLIT:%.*]]
+; CHECK:       if.end.sink.split:
+; CHECK-NEXT:    store i16 0, i16* [[IV_PTR]], align 2
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[INCDEC_IV_PTR]] = getelementptr inbounds i16, i16* [[IV_PTR]], i64 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp ult i64 [[IV]], 1023
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
index 5c7bd79d751fe8..012106fa50e52d 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S  -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -dce \
 ; RUN:   -instcombine -force-vector-width=2  < %s | FileCheck %s
 ;
@@ -7,15 +8,32 @@
 ; Check that the addresses for a scalarized memory access is not extracted
 ; from a vector register.
 define i32 @foo(i32* nocapture %A) {
-;CHECK-LABEL: @foo(
-;CHECK:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-;CHECK:  %0 = shl nsw i64 %index, 2
-;CHECK:  %1 = shl i64 %index, 2
-;CHECK:  %2 = or i64 %1, 4
-;CHECK:  %3 = getelementptr inbounds i32, i32* %A, i64 %0
-;CHECK:  %4 = getelementptr inbounds i32, i32* %A, i64 %2
-;CHECK:  store i32 4, i32* %3, align 4
-;CHECK:  store i32 4, i32* %4, align 4
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    store i32 4, i32* [[TMP3]], align 4
+; CHECK-NEXT:    store i32 4, i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 undef
+;
 
 entry:
   br label %for.body
@@ -37,20 +55,37 @@ for.end:
 
 ; Check that a load of address is scalarized.
 define i32 @foo1(i32* nocapture noalias %A, i32** nocapture %PtrPtr) {
-;CHECK-LABEL: @foo1(
-;CHECK:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-;CHECK:  %0 = or i64 %index, 1
-;CHECK:  %1 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %index
-;CHECK:  %2 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %0
-;CHECK:  %3 = load i32*, i32** %1, align 8
-;CHECK:  %4 = load i32*, i32** %2, align 8
-;CHECK:  %5 = load i32, i32* %3, align 4
-;CHECK:  %6 = load i32, i32* %4, align 4
-;CHECK:  %7 = insertelement <2 x i32> poison, i32 %5, i32 0
-;CHECK:  %8 = insertelement <2 x i32> %7, i32 %6, i32 1
-;CHECK:  %9 = getelementptr inbounds i32, i32* %A, i64 %index
-;CHECK:  %10 = bitcast i32* %9 to <2 x i32>*
-;CHECK:  store <2 x i32> %8, <2 x i32>* %10, align 4
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32*, i32** [[PTRPTR:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** [[PTRPTR]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32*, i32** [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP8]], <2 x i32>* [[TMP10]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 undef
+;
 
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
index e9a4541c01df1c..8966f7caec98a6 100644
--- a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
+++ b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
@@ -1,12 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -dce -instcombine -S -force-vector-width=4 < %s 2>%t | FileCheck %s
 
 define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @inv_store_last_lane
-; CHECK: vector.body:
-; CHECK:  store <4 x i32> %[[VEC_VAL:.*]], <
-; CHECK: middle.block:
-; CHECK:  %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3
-
+; CHECK-LABEL: @inv_store_last_lane(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP7]], 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[INV:%.*]], i64 42
+; CHECK-NEXT:    store i32 [[MUL_LCSSA]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -28,12 +65,46 @@ exit:              ; preds = %for.body
 }
 
 define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @ret_last_lane
-; CHECK: vector.body:
-; CHECK:  store <4 x float> %[[VEC_VAL:.*]], <
-; CHECK: middle.block:
-; CHECK:  %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3
-
+; CHECK-LABEL: @ret_last_lane(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP7]], 2.000000e+00
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[MUL]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MUL_LCSSA]]
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
index 803ee72dea9792..dd1e9889af3811 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
 ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s
 
@@ -8,27 +9,72 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 ; contains a conditional store. The store group contains gaps and is not
 ; vectorized.
 ;
-; CHECK-LABEL: @interleaved_with_cond_store_0(
 ;
-; CHECK: vector.ph
-; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
-; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
-; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
-; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
 ;
-; CHECK: vector.body:
-; CHECK:   %wide.vec = load <4 x i64>, <4 x i64>* %{{.*}}
-; CHECK:   %strided.vec = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
 ;
-; CHECK: pred.store.if
-; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
-; CHECK:   store i64 %[[X1]], {{.*}}
 ;
-; CHECK: pred.store.if
-; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
-; CHECK:   store i64 %[[X2]], {{.*}}
 
 define void @interleaved_with_cond_store_0(%pair *%p, i64 %x, i64 %n) {
+; CHECK-LABEL: @interleaved_with_cond_store_0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 2, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P:%.*]], i64 [[INDEX]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <4 x i64>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 0
+; CHECK-NEXT:    store i64 [[TMP6]], i64* [[TMP2]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 2
+; CHECK-NEXT:    store i64 [[TMP10]], i64* [[TMP9]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[IF_MERGE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[P_1:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[P_1]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[TMP12]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[IF_THEN:%.*]], label [[IF_MERGE]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i64 [[TMP12]], i64* [[P_1]], align 8
+; CHECK-NEXT:    br label [[IF_MERGE]]
+; CHECK:       if.merge:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -57,34 +103,84 @@ for.end:
 ; groups are separately vectorized. The store group contains gaps and is not
 ; vectorized.
 ;
-; CHECK-LABEL: @interleaved_with_cond_store_1(
 ;
-; CHECK: vector.ph
-; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
-; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
-; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
-; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
 ;
-; CHECK: vector.body:
-; CHECK:   %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
-; CHECK:   %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
 ;
-; CHECK: pred.store.if
-; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
-; CHECK:   store i64 %[[X1]], {{.*}}
 ;
-; CHECK: pred.store.if
-; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
-; CHECK:   store i64 %[[X2]], {{.*}}
 ;
-; CHECK: pred.store.continue
-; CHECK:   %[[L2:.+]] = load <4 x i64>, <4 x i64>* {{.*}}
-; CHECK:   %[[X3:.+]] = extractelement <4 x i64> %[[L2]], i32 0
-; CHECK:   store i64 %[[X3]], {{.*}}
-; CHECK:   %[[X4:.+]] = extractelement <4 x i64> %[[L2]], i32 2
-; CHECK:   store i64 %[[X4]], {{.*}}
 
 define void @interleaved_with_cond_store_1(%pair *%p, i64 %x, i64 %n) {
+; CHECK-LABEL: @interleaved_with_cond_store_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 2, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P:%.*]], i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[INDEX]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[TMP4]] to <4 x i64>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, <4 x i64>* [[TMP6]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 0
+; CHECK-NEXT:    store i64 [[TMP9]], i64* [[TMP3]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 2
+; CHECK-NEXT:    store i64 [[TMP12]], i64* [[TMP11]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP3]] to <4 x i64>*
+; CHECK-NEXT:    [[WIDE_VEC3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i32 0
+; CHECK-NEXT:    store i64 [[TMP14]], i64* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i32 2
+; CHECK-NEXT:    store i64 [[TMP15]], i64* [[TMP5]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[IF_MERGE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[P_0:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 0
+; CHECK-NEXT:    [[P_1:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, i64* [[P_1]], align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[IF_THEN:%.*]], label [[IF_MERGE]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i64 [[TMP17]], i64* [[P_0]], align 8
+; CHECK-NEXT:    br label [[IF_MERGE]]
+; CHECK:       if.merge:
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, i64* [[P_0]], align 8
+; CHECK-NEXT:    store i64 [[TMP19]], i64* [[P_1]], align 8
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -116,29 +212,78 @@ for.end:
 ; predicated block. The load group is vectorized, and the store groups contain
 ; gaps and are not vectorized.
 ;
-; CHECK-LABEL: @interleaved_with_cond_store_2(
 ;
-; CHECK: vector.ph
-; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 1
-; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
-; CHECK:   %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
-; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
 ;
-; CHECK: vector.body:
-; CHECK:   %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
-; CHECK:   %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; CHECK:   store i64 %x, {{.*}}
-; CHECK:   store i64 %x, {{.*}}
 ;
-; CHECK: pred.store.if
-; CHECK:   %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
-; CHECK:   store i64 %[[X1]], {{.*}}
 ;
-; CHECK: pred.store.if
-; CHECK:   %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
-; CHECK:   store i64 %[[X2]], {{.*}}
 
 define void @interleaved_with_cond_store_2(%pair *%p, i64 %x, i64 %n) {
+; CHECK-LABEL: @interleaved_with_cond_store_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 2, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P:%.*]], i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[INDEX]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <4 x i64>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <4 x i64>, <4 x i64>* [[TMP6]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    store i64 [[X]], i64* [[TMP3]], align 8
+; CHECK-NEXT:    store i64 [[X]], i64* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 0
+; CHECK-NEXT:    store i64 [[TMP9]], i64* [[TMP5]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 2
+; CHECK-NEXT:    store i64 [[TMP12]], i64* [[TMP11]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[IF_MERGE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[P_0:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 0
+; CHECK-NEXT:    [[P_1:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i64, i64* [[P_1]], align 8
+; CHECK-NEXT:    store i64 [[X]], i64* [[P_0]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[TMP14]], [[X]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[IF_THEN:%.*]], label [[IF_MERGE]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i64 [[TMP14]], i64* [[P_1]], align 8
+; CHECK-NEXT:    br label [[IF_MERGE]]
+; CHECK:       if.merge:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index fd1e909066844b..0688808104c7fb 100644
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
 
 ; First licm pass is to hoist/sink invariant stores if possible. Today LICM does
@@ -12,23 +13,64 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; address.
 
 
-; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
 ; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
-; CHECK: vector.memcheck:
-; CHECK:    found.conflict
-
-; CHECK-LABEL: vector.body:
-; CHECK:         %vec.phi = phi <4 x i32>  [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK:         %wide.load = load <4 x i32>
-; CHECK:         [[ADD]] = add <4 x i32> %vec.phi, %wide.load
-; CHECK-NEXT:    store i32 %ntrunc, i32* %a
-; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT:    br i1
-
-; CHECK-LABEL: middle.block:
-; CHECK:         call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
+
+
 define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[I0:%.*]] = phi i32 [ [[I3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
+; CHECK-NEXT:    [[I3]] = add i32 [[I0]], [[I2]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    [[I3_LCSSA:%.*]] = phi i32 [ [[I3]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[I4:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[I3_LCSSA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I4]]
+;
 entry:
   %ntrunc = trunc i64 %n to i32
   br label %for.body
@@ -49,14 +91,54 @@ for.end:                                          ; preds = %for.body
   ret i32 %i4
 }
 
-; CHECK-LABEL: inv_val_store_to_inv_address(
-; CHECK-LABEL: vector.body:
-; CHECK:         store i32 %ntrunc, i32* %a
-; CHECK:         store <4 x i32>
-; CHECK-NEXT:    %index.next = add nuw i64 %index, 4
-; CHECK-NEXT:    icmp eq i64 %index.next, %n.vec
-; CHECK-NEXT:    br i1
 define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+; CHECK-LABEL: @inv_val_store_to_inv_address(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP1]], align 4, !alias.scope !11
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %ntrunc = trunc i64 %n to i32
   br label %for.body
@@ -83,21 +165,90 @@ for.end:                                          ; preds = %for.body
 ; TODO: We can be better with the code gen for the first test and we can have
 ; just one scalar store if vector.or.reduce(vector_cmp(b[i] == k)) is 1.
 
-; CHECK-LABEL:inv_val_store_to_inv_address_conditional(
-; CHECK-LABEL: vector.body:
-; CHECK:           %wide.load = load <4 x i32>, <4 x i32>*
-; CHECK:           [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
-; CHECK:           store <4 x i32>
-; CHECK-NEXT:      [[EE:%[a-zA-Z0-9.]+]] =  extractelement <4 x i1> [[CMP]], i32 0
-; CHECK-NEXT:      br i1 [[EE]], label %pred.store.if, label %pred.store.continue
 
-; CHECK-LABEL: pred.store.if:
-; CHECK-NEXT:      store i32 %ntrunc, i32* %a
-; CHECK-NEXT:      br label %pred.store.continue
 
-; CHECK-LABEL: pred.store.continue:
-; CHECK-NEXT:      [[EE1:%[a-zA-Z0-9.]+]] =  extractelement <4 x i1> [[CMP]], i32 1
 define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+; CHECK-LABEL: @inv_val_store_to_inv_address_conditional(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP3]], align 4, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK:       pred.store.if9:
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; CHECK:       pred.store.continue10:
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK:       pred.store.if11:
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; CHECK:       pred.store.continue12:
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]]
+; CHECK:       pred.store.if13:
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
+; CHECK:       pred.store.continue14:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %ntrunc = trunc i64 %n to i32
   br label %for.body
@@ -128,9 +279,31 @@ for.end:                                          ; preds = %for.body
 ; else a = k;
 ; TODO: We could vectorize this once we support multiple uniform stores to the
 ; same address.
-; CHECK-LABEL:inv_val_store_to_inv_address_conditional_diff_values(
-; CHECK-NOT:           load <4 x i32>
 define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_diff_values(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I2]], [[K:%.*]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       cond_store_k:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[K]], [[COND_STORE_K]] ], [ [[NTRUNC]], [[COND_STORE]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE]], i32* [[A:%.*]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %ntrunc = trunc i64 %n to i32
   br label %for.body
@@ -169,9 +342,47 @@ for.end:                                          ; preds = %for.body
 ;    }
 ;  }
 
-; CHECK-LABEL: multiple_uniform_stores
-; CHECK-NOT:     <4 x i32>
 define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+; CHECK-LABEL: @multiple_uniform_stores(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP20]], label [[FOR_END10:%.*]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR_INC8:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[J_022:%.*]] = phi i32 [ [[J_1_LCSSA:%.*]], [[FOR_INC8]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]]
+; CHECK-NEXT:    br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]]
+; CHECK:       for.body3.lr.ph:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]]
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[J_022]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i32 [[ADD]], 1
+; CHECK-NEXT:    store i32 [[TMP3]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[FOR_BODY3]]
+; CHECK:       for.inc8.loopexit:
+; CHECK-NEXT:    br label [[FOR_INC8]]
+; CHECK:       for.inc8:
+; CHECK-NEXT:    [[J_1_LCSSA]] = phi i32 [ [[J_022]], [[FOR_COND1_PREHEADER]] ], [ [[ITR]], [[FOR_INC8_LOOPEXIT]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV25:%.*]] = trunc i64 [[INDVARS_IV_NEXT24]] to i32
+; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i32 [[LFTR_WIDEIV25]], [[ITR]]
+; CHECK-NEXT:    br i1 [[EXITCOND26]], label [[FOR_END10_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       for.end10.loopexit:
+; CHECK-NEXT:    br label [[FOR_END10]]
+; CHECK:       for.end10:
+; CHECK-NEXT:    ret i32 undef
+;
 entry:
   %cmp20 = icmp eq i32 %itr, 0
   br i1 %cmp20, label %for.end10, label %for.cond1.preheader
@@ -215,9 +426,53 @@ for.end10:                                        ; preds = %for.inc8, %entry
 
 ; second uniform store to the same address is conditional.
 ; we do not vectorize this.
-; CHECK-LABEL: multiple_uniform_stores_conditional
-; CHECK-NOT:    <4 x i32>
 define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+; CHECK-LABEL: @multiple_uniform_stores_conditional(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP20]], label [[FOR_END10:%.*]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK:       for.cond1.preheader:
+; CHECK-NEXT:    [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR_INC8:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[J_022:%.*]] = phi i32 [ [[J_1_LCSSA:%.*]], [[FOR_INC8]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]]
+; CHECK-NEXT:    br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]]
+; CHECK:       for.body3.lr.ph:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]]
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[J_022]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i32 [[ADD]], 42
+; CHECK-NEXT:    br i1 [[TMP3]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i32 [[ADD]], 1
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[TMP4]], [[COND_STORE]] ], [ [[ADD]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    store i32 [[STOREMERGE]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[FOR_BODY3]]
+; CHECK:       for.inc8.loopexit:
+; CHECK-NEXT:    br label [[FOR_INC8]]
+; CHECK:       for.inc8:
+; CHECK-NEXT:    [[J_1_LCSSA]] = phi i32 [ [[J_022]], [[FOR_COND1_PREHEADER]] ], [ [[ITR]], [[FOR_INC8_LOOPEXIT]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1
+; CHECK-NEXT:    [[LFTR_WIDEIV25:%.*]] = trunc i64 [[INDVARS_IV_NEXT24]] to i32
+; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i32 [[LFTR_WIDEIV25]], [[ITR]]
+; CHECK-NEXT:    br i1 [[EXITCOND26]], label [[FOR_END10_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
+; CHECK:       for.end10.loopexit:
+; CHECK-NEXT:    br label [[FOR_END10]]
+; CHECK:       for.end10:
+; CHECK-NEXT:    ret i32 undef
+;
 entry:
   %cmp20 = icmp eq i32 %itr, 0
   br i1 %cmp20, label %for.end10, label %for.cond1.preheader
@@ -272,8 +527,35 @@ for.end10:                                        ; preds = %for.inc8, %entry
 ; Note: %i10 could be replaced by phi(%arg4, %i12), a potentially vectorizable
 ; 1st-order-recurrence
 define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
-; CHECK-LABEL: unsafe_dep_uniform_load_store
-; CHECK-NOT: <4 x i32>
+; CHECK-LABEL: @unsafe_dep_uniform_load_store(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I6:%.*]] = getelementptr inbounds i16, i16* [[ARG3:%.*]], i64 [[ARG5:%.*]]
+; CHECK-NEXT:    br label [[BB7:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    [[I121:%.*]] = phi i32 [ [[ARG4:%.*]], [[BB:%.*]] ], [ [[I12:%.*]], [[BB7]] ]
+; CHECK-NEXT:    [[I8:%.*]] = phi i64 [ 0, [[BB]] ], [ [[I24:%.*]], [[BB7]] ]
+; CHECK-NEXT:    [[I9:%.*]] = phi i32 [ [[ARG1:%.*]], [[BB]] ], [ [[I23:%.*]], [[BB7]] ]
+; CHECK-NEXT:    [[I11:%.*]] = mul nsw i32 [[I9]], [[I121]]
+; CHECK-NEXT:    [[I12]] = srem i32 [[I11]], 65536
+; CHECK-NEXT:    [[I13:%.*]] = add nsw i32 [[I12]], [[I9]]
+; CHECK-NEXT:    [[I14:%.*]] = trunc i32 [[I13]] to i16
+; CHECK-NEXT:    [[I15:%.*]] = trunc i64 [[I8]] to i32
+; CHECK-NEXT:    [[I16:%.*]] = add i32 [[I15]], [[ARG:%.*]]
+; CHECK-NEXT:    [[I17:%.*]] = zext i32 [[I16]] to i64
+; CHECK-NEXT:    [[I18:%.*]] = getelementptr inbounds i16, i16* [[I6]], i64 [[I17]]
+; CHECK-NEXT:    store i16 [[I14]], i16* [[I18]], align 2
+; CHECK-NEXT:    [[I19:%.*]] = add i32 [[I13]], [[I9]]
+; CHECK-NEXT:    [[I20:%.*]] = trunc i32 [[I19]] to i16
+; CHECK-NEXT:    [[I21:%.*]] = and i16 [[I20]], 255
+; CHECK-NEXT:    [[I22:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i64 [[I17]]
+; CHECK-NEXT:    store i16 [[I21]], i16* [[I22]], align 2
+; CHECK-NEXT:    [[I23]] = add nsw i32 [[I9]], 1
+; CHECK-NEXT:    [[I24]] = add nuw nsw i64 [[I8]], 1
+; CHECK-NEXT:    [[I25:%.*]] = icmp eq i64 [[I24]], [[ARG2:%.*]]
+; CHECK-NEXT:    br i1 [[I25]], label [[BB26:%.*]], label [[BB7]]
+; CHECK:       bb26:
+; CHECK-NEXT:    ret void
+;
 bb:
   %i = alloca i32
   store i32 %arg4, i32* %i
@@ -309,4 +591,3 @@ bb26:
 }
 
 ; Make sure any check-not directives are not triggered by function declarations.
-; CHECK: declare
diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
index d1c38b6ce4098c..ec4a3397593349 100644
--- a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
+++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
@@ -1,31 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; REQUIRES: asserts
 ; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine -enable-new-pm=0 2>&1 | FileCheck %s
 ; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-vectorize,instcombine -force-vector-width=2 -force-vector-interleave=1 -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
-; CHECK-LABEL: vector_gep
-; CHECK-NOT:   LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+;
+define void @vector_gep(i32** %a, i32 *%b, i64 %n) {
+; CHECK-LABEL: @vector_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <2 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>*
-; CHECK-NEXT:    store <2 x i32*> [[TMP1]], <2 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <2 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>*
+; CHECK-NEXT:    store <2 x i32*> [[TMP0]], <2 x i32*>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[VAR1:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]]
+; CHECK-NEXT:    store i32* [[VAR0]], i32** [[VAR1]], align 8
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
-define void @vector_gep(i32** %a, i32 *%b, i64 %n) {
 entry:
   br label %for.body
 
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
-  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
-  store i32* %tmp0, i32** %tmp1, align 8
+  %var0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %var1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %var0, i32** %var1, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -34,33 +59,58 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: scalar_store
-; CHECK:       LV: Found scalar instruction: %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
-; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
-; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+;
+define void @scalar_store(i32** %a, i32 *%b, i64 %n) {
+; CHECK-LABEL: @scalar_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    store i32* [[TMP4]], i32** [[TMP6]], align 8
 ; CHECK-NEXT:    store i32* [[TMP5]], i32** [[TMP7]], align 8
-; CHECK-NEXT:    store i32* [[TMP6]], i32** [[TMP8]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[VAR1:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]]
+; CHECK-NEXT:    store i32* [[VAR0]], i32** [[VAR1]], align 8
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
-define void @scalar_store(i32** %a, i32 *%b, i64 %n) {
 entry:
   br label %for.body
 
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
-  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
-  store i32* %tmp0, i32** %tmp1, align 8
+  %var0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %var1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %var0, i32** %var1, align 8
   %i.next = add nuw nsw i64 %i, 2
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -69,39 +119,63 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: expansion
-; CHECK:       LV: Found scalar instruction: %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
-; CHECK-NEXT:  LV: Found scalar instruction: %tmp1 = bitcast i64* %tmp0 to i32*
-; CHECK-NEXT:  LV: Found scalar instruction: %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
-; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
-; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+;
+define void @expansion(i32** %a, i64 *%b, i64 %n) {
+; CHECK-LABEL: @expansion(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806
+; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32** [[TMP6]] to i64**
+; CHECK-NEXT:    store i64* [[TMP4]], i64** [[TMP8]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32** [[TMP7]] to i64**
 ; CHECK-NEXT:    store i64* [[TMP5]], i64** [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32** [[TMP8]] to i64**
-; CHECK-NEXT:    store i64* [[TMP6]], i64** [[TMP10]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[VAR3:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32** [[VAR3]] to i64**
+; CHECK-NEXT:    store i64* [[VAR0]], i64** [[TMP11]], align 8
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
-define void @expansion(i32** %a, i64 *%b, i64 %n) {
 entry:
   br label %for.body
 
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
-  %tmp1 = bitcast i64* %tmp0 to i32*
-  %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
-  %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
-  store i32* %tmp1, i32** %tmp3, align 8
+  %var0 = getelementptr inbounds i64, i64* %b, i64 %i
+  %var1 = bitcast i64* %var0 to i32*
+  %var2 = getelementptr inbounds i32*, i32** %a, i64 0
+  %var3 = getelementptr inbounds i32*, i32** %var2, i64 %i
+  store i32* %var1, i32** %var3, align 8
   %i.next = add nuw nsw i64 %i, 2
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -110,31 +184,53 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: no_gep_or_bitcast
-; CHECK-NOT:   LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8
-; CHECK:       LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1
+;
+define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) {
+; CHECK-LABEL: @no_gep_or_bitcast(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32** [[TMP0]] to <2 x i32*>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    store i32 0, i32* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1
 ; CHECK-NEXT:    store i32 0, i32* [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    store i32 0, i32* [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]]
+; CHECK-NEXT:    [[VAR1:%.*]] = load i32*, i32** [[VAR0]], align 8
+; CHECK-NEXT:    store i32 0, i32* [[VAR1]], align 8
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
-define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) {
 entry:
   br label %for.body
 
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %tmp0 = getelementptr inbounds i32*, i32** %a, i64 %i
-  %tmp1 = load i32*, i32** %tmp0, align 8
-  store i32 0, i32* %tmp1, align 8
+  %var0 = getelementptr inbounds i32*, i32** %a, i64 %i
+  %var1 = load i32*, i32** %var0, align 8
+  store i32 0, i32* %var1, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index 0ded0899078fbb..0f4e4d0988fa52 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -scalable-vectorization=on -force-target-instruction-cost=1 -force-target-supports-scalable-vectors -dce -instcombine < %s -S | FileCheck %s
 
 ; Test that we can add on the induction variable
@@ -9,25 +10,72 @@
 define void @add_ind64_unrolled(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @add_ind64_unrolled(
 ; CHECK-NEXT:  entry:
-; CHECK: vector.body:
-; CHECK-NEXT:  %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
-; CHECK-NEXT:  %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT:  %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[INDEX]], i32 0
-; CHECK-NEXT:  %[[IDXSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:  %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[STEPVEC]]
-; CHECK-NEXT:  %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:  %[[EC:.*]] = shl i64 %[[VSCALE]], 1
-; CHECK-NEXT:  %[[TMP2:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[EC]], i32 0
-; CHECK-NEXT:  %[[ECSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT:  %[[TMP3:.*]] = add <vscale x 2 x i64> %[[ECSPLT]], %[[STEPVEC]]
-; CHECK-NEXT:  %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[TMP3]]
-; CHECK:       %[[LOAD1:.*]] = load <vscale x 2 x i64>
-; CHECK:       %[[LOAD2:.*]] = load <vscale x 2 x i64>
-; CHECK:       %[[STOREVAL1:.*]] = add nsw <vscale x 2 x i64> %[[LOAD1]], %[[VECIND1]]
-; CHECK:       %[[STOREVAL2:.*]] = add nsw <vscale x 2 x i64> %[[LOAD2]], %[[VECIND2]]
-; CHECK:       store <vscale x 2 x i64> %[[STOREVAL1]]
-; CHECK:       store <vscale x 2 x i64> %[[STOREVAL2]]
-
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP13:%.*]] = shl i32 [[TMP12]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64* [[TMP15]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP16]], align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_LOAD]], [[TMP5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_LOAD6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i64* [[TMP19]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP17]], <vscale x 2 x i64>* [[TMP20]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP22:%.*]] = shl i32 [[TMP21]], 1
+; CHECK-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, i64* [[TMP19]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i64* [[TMP24]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP18]], <vscale x 2 x i64>* [[TMP25]], align 8
+; CHECK-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP27:%.*]] = shl i64 [[TMP26]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP27]]
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP29]], [[I_08]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_08]]
+; CHECK-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -52,24 +100,69 @@ exit:                                 ; preds = %for.body
 define void @add_ind64_unrolled_nxv1i64(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) {
 ; CHECK-LABEL: @add_ind64_unrolled_nxv1i64(
 ; CHECK-NEXT:  entry:
-; CHECK: vector.body:
-; CHECK-NEXT:  %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
-; CHECK-NEXT:  %[[STEPVEC:.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
-; CHECK-NEXT:  %[[TMP1:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[INDEX]], i32 0
-; CHECK-NEXT:  %[[IDXSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; CHECK-NEXT:  %[[VECIND1:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[STEPVEC]]
-; CHECK-NEXT:  %[[EC:.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:  %[[TMP2:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[EC]], i32 0
-; CHECK-NEXT:  %[[ECSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP2]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; CHECK-NEXT:  %[[TMP3:.*]] = add <vscale x 1 x i64> %[[ECSPLT]], %[[STEPVEC]]
-; CHECK-NEXT:  %[[VECIND2:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[TMP3]]
-; CHECK:       %[[LOAD1:.*]] = load <vscale x 1 x i64>
-; CHECK:       %[[LOAD2:.*]] = load <vscale x 1 x i64>
-; CHECK:       %[[STOREVAL1:.*]] = add nsw <vscale x 1 x i64> %[[LOAD1]], %[[VECIND1]]
-; CHECK:       %[[STOREVAL2:.*]] = add nsw <vscale x 1 x i64> %[[LOAD2]], %[[VECIND2]]
-; CHECK:       store <vscale x 1 x i64> %[[STOREVAL1]]
-; CHECK:       store <vscale x 1 x i64> %[[STOREVAL2]]
-
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 1
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT2]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 1 x i64> [[DOTSPLAT3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT4]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 1 x i64> [[DOTSPLAT5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 1 x i64> [[DOTSPLAT3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <vscale x 1 x i64>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, <vscale x 1 x i64>* [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i64* [[TMP13]] to <vscale x 1 x i64>*
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <vscale x 1 x i64>, <vscale x 1 x i64>* [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <vscale x 1 x i64> [[WIDE_LOAD]], [[TMP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <vscale x 1 x i64> [[WIDE_LOAD6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64* [[TMP17]] to <vscale x 1 x i64>*
+; CHECK-NEXT:    store <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64>* [[TMP18]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, i64* [[TMP17]], i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i64* [[TMP21]] to <vscale x 1 x i64>*
+; CHECK-NEXT:    store <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64>* [[TMP22]], align 8
+; CHECK-NEXT:    [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP24:%.*]] = shl i64 [[TMP23]], 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP26]], [[I_08]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_08]]
+; CHECK-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -100,17 +193,56 @@ exit:                                 ; preds = %for.body
 
 define void @add_unique_ind32(i32* noalias nocapture %a, i64 %n) {
 ; CHECK-LABEL: @add_unique_ind32(
-; CHECK:    vector.ph:
-; CHECK:      %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; CHECK-NEXT: %[[INDINIT:.*]] = shl <vscale x 4 x i32> %[[STEPVEC]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: %[[INC:.*]] = shl i32 %[[VSCALE]], 3
-; CHECK-NEXT: %[[TMP:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INC]], i32 0
-; CHECK-NEXT: %[[VECINC:.*]] = shufflevector <vscale x 4 x i32> %[[TMP]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK:    vector.body:
-; CHECK:      %[[VECIND:.*]] = phi <vscale x 4 x i32> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
-; CHECK:      store <vscale x 4 x i32> %[[VECIND]]
-; CHECK:      %[[VECINDNXT]] = add <vscale x 4 x i32> %[[VECIND]], %[[VECINC]]
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[CAST_CRD]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <vscale x 4 x i32> [[TMP4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i32 [[TMP6]], 3
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[VEC_IND]], <vscale x 4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_08]]
+; CHECK-NEXT:    store i32 [[R_07]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[R_07]], 2
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -139,22 +271,61 @@ exit:                                 ; preds = %for.body
 
 define void @add_unique_indf32(float* noalias nocapture %a, i64 %n) {
 ; CHECK-LABEL: @add_unique_indf32(
-; CHECK:    vector.ph:
-; CHECK:      %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; CHECK-NEXT: %[[TMP1:.*]] = uitofp <vscale x 4 x i32> %[[STEPVEC]] to <vscale x 4 x float>
-; CHECK-NEXT: %[[TMP2:.*]] = fmul <vscale x 4 x float> %[[TMP1]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: %[[INDINIT:.*]] = fadd <vscale x 4 x float> %[[TMP2]], zeroinitializer
-; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: %[[TMP3:.*]] = shl i32 %8, 2
-; CHECK-NEXT: %[[TMP4:.*]] = uitofp i32 %[[TMP3]] to float
-; CHECK-NEXT: %[[INC:.*]] = fmul float %[[TMP4]], 2.000000e+00
-; CHECK-NEXT: %[[TMP5:.*]] = insertelement <vscale x 4 x float> poison, float %[[INC]], i32 0
-; CHECK-NEXT: %[[VECINC:.*]] = shufflevector <vscale x 4 x float> %[[TMP5]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK:   vector.body:
-; CHECK:     %[[VECIND:.*]] = phi <vscale x 4 x float> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
-; CHECK:     store <vscale x 4 x float> %[[VECIND]]
-; CHECK:     %[[VECINDNXT]] = fadd <vscale x 4 x float> %[[VECIND]], %[[VECINC]]
-
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul float [[CAST_CRD]], 2.000000e+00
+; CHECK-NEXT:    [[IND_END:%.*]] = fadd float [[TMP4]], 0.000000e+00
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT:    [[TMP6:%.*]] = uitofp <vscale x 4 x i32> [[TMP5]] to <vscale x 4 x float>
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <vscale x 4 x float> [[TMP6]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = fadd <vscale x 4 x float> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP9:%.*]] = shl i32 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul float [[TMP10]], 2.000000e+00
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[TMP11]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x float> [[DOTSPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
+; CHECK-NEXT:    store <vscale x 4 x float> [[VEC_IND]], <vscale x 4 x float>* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP15:%.*]] = shl i64 [[TMP14]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = fadd <vscale x 4 x float> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I_08]]
+; CHECK-NEXT:    store float [[R_07]], float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD]] = fadd float [[R_07]], 2.000000e+00
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/vector-geps.ll b/llvm/test/Transforms/LoopVectorize/vector-geps.ll
index beed0542db9362..e371b98e06fc9e 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-geps.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-geps.ll
@@ -1,28 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
+;
+define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
 ; CHECK-LABEL: @vector_gep_stored(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>*
-; CHECK-NEXT:    store <4 x i32*> [[TMP1]], <4 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <4 x i32*>*
+; CHECK-NEXT:    store <4 x i32*> [[TMP0]], <4 x i32*>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[VAR1:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]]
+; CHECK-NEXT:    store i32* [[VAR0]], i32** [[VAR1]], align 8
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
-define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
 entry:
   br label %for.body
 
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
-  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
-  store i32* %tmp0, i32** %tmp1, align 8
+  %var0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %var1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %var0, i32** %var1, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end
@@ -31,27 +57,52 @@ for.end:
   ret void
 }
 
+;
+define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
 ; CHECK-LABEL: @uniform_vector_gep_stored(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, i64 1
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP0]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32*> [[DOTSPLATINSERT]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>*
-; CHECK-NEXT:    store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <4 x i32*>*
+; CHECK-NEXT:    store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
+; CHECK-NEXT:    [[VAR1:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]]
+; CHECK-NEXT:    store i32* [[VAR0]], i32** [[VAR1]], align 8
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
-define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
 entry:
   br label %for.body
 
 for.body:
   %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %tmp0 = getelementptr inbounds i32, i32* %b, i64 1
-  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
-  store i32* %tmp0, i32** %tmp1, align 8
+  %var0 = getelementptr inbounds i32, i32* %b, i64 1
+  %var1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %var0, i32** %var1, align 8
   %i.next = add nuw nsw i64 %i, 1
   %cond = icmp slt i64 %i.next, %n
   br i1 %cond, label %for.body, label %for.end