diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll index 668946f9eda44d..d6d5275cc5fc98 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll @@ -1,11 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -instcombine < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" define @dup_insertelement_0( %v, i8 %s) #0 { ; CHECK-LABEL: @dup_insertelement_0( -; CHECK: %insert = insertelement %v, i8 %s, i64 0 -; CHECK-NEXT: ret %insert +; CHECK-NEXT: [[INSERT:%.*]] = insertelement [[V:%.*]], i8 [[S:%.*]], i64 0 +; CHECK-NEXT: ret [[INSERT]] +; %pg = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 1) %insert = tail call @llvm.aarch64.sve.dup.nxv16i8( %v, %pg, i8 %s) ret %insert @@ -13,9 +15,10 @@ define @dup_insertelement_0( %v, i8 %s) #0 define @dup_insertelement_1( %v, i8 %s) #0 { ; CHECK-LABEL: @dup_insertelement_1( -; CHECK: %pg = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 2) -; CHECK-NEXT: %insert = tail call @llvm.aarch64.sve.dup.nxv16i8( %v, %pg, i8 %s) -; CHECK-NEXT: ret %insert +; CHECK-NEXT: [[PG:%.*]] = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 2) +; CHECK-NEXT: [[INSERT:%.*]] = tail call @llvm.aarch64.sve.dup.nxv16i8( [[V:%.*]], [[PG]], i8 [[S:%.*]]) +; CHECK-NEXT: ret [[INSERT]] +; %pg = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 2) %insert = tail call @llvm.aarch64.sve.dup.nxv16i8( %v, %pg, i8 %s) ret %insert @@ -23,16 +26,18 @@ define @dup_insertelement_1( %v, i8 %s) #0 define @dup_insertelement_x( %v, i8 %s, %pg) #0 { ; CHECK-LABEL: @dup_insertelement_x( -; CHECK: %insert = tail call @llvm.aarch64.sve.dup.nxv16i8( %v, %pg, i8 %s) -; CHECK-NEXT: ret %insert +; CHECK-NEXT: [[INSERT:%.*]] = tail call @llvm.aarch64.sve.dup.nxv16i8( [[V:%.*]], [[PG:%.*]], i8 [[S:%.*]]) +; CHECK-NEXT: ret [[INSERT]] +; %insert = tail call @llvm.aarch64.sve.dup.nxv16i8( %v, %pg, i8 %s) ret %insert } define @dup_insertelement_0_convert( %v, i16 %s) #0 { ; CHECK-LABEL: @dup_insertelement_0_convert( -; CHECK: %insert = insertelement %v, i16 %s, i64 0 -; CHECK-NEXT: ret %insert +; CHECK-NEXT: [[INSERT:%.*]] = insertelement [[V:%.*]], i16 [[S:%.*]], i64 0 +; CHECK-NEXT: ret [[INSERT]] +; %pg = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 1) %1 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %pg) %2 = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( %1) @@ -42,9 +47,10 @@ define @dup_insertelement_0_convert( %v, i1 define @dupx_splat_convert(i16 %s) #0 { ; CHECK-LABEL: @dupx_splat_convert( -; CHECK-NEXT: %.splatinsert = insertelement poison, i16 %s, i32 0 -; CHECK-NEXT: %splat = shufflevector %.splatinsert, poison, zeroinitializer -; CHECK-NEXT: ret %splat +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[S:%.*]], i32 0 +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: ret [[SPLAT]] +; %splat = tail call @llvm.aarch64.sve.dup.x.nxv8i16(i16 %s) ret %splat } diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll index e46879fe645e88..1be1a680ff5331 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-tbl-dupx.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -instcombine < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -5,11 +6,12 @@ target triple = "aarch64-unknown-linux-gnu" ; op2 = tbl(op1 dup_x(idx)) -> op2 = vector_splat(extractelement(op1, idx)) define @dup_ext_i8( %data) #0 { +; ; CHECK-LABEL: @dup_ext_i8( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[DATA:%.*]], i8 1 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP1]], i32 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: ret [[DOTSPLAT]] +; CHECK-NEXT: [[OUT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: ret [[OUT]] ; %tmp = call @llvm.aarch64.sve.dup.x.nxv16i8(i8 1) %out = call @llvm.aarch64.sve.tbl.nxv16i8( %data, %tmp) @@ -17,11 +19,12 @@ define @dup_ext_i8( %data) #0 { } define @dup_ext_i16( %data) #0 { +; ; CHECK-LABEL: @dup_ext_i16( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[DATA:%.*]], i16 1 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[TMP1]], i32 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: ret [[DOTSPLAT]] +; CHECK-NEXT: [[OUT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: ret [[OUT]] ; %tmp = call @llvm.aarch64.sve.dup.x.nxv8i16(i16 1) %out = call @llvm.aarch64.sve.tbl.nxv8i16( %data, %tmp) @@ -29,11 +32,12 @@ define @dup_ext_i16( %data) #0 { } define @dup_ext_i32( %data) #0 { +; ; CHECK-LABEL: @dup_ext_i32( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[DATA:%.*]], i32 1 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: ret [[DOTSPLAT]] +; CHECK-NEXT: [[OUT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: ret [[OUT]] ; %tmp = call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) %out = call @llvm.aarch64.sve.tbl.nxv4i32( %data, %tmp) @@ -41,11 +45,12 @@ define @dup_ext_i32( %data) #0 { } define @dup_ext_i64( %data) #0 { +; ; CHECK-LABEL: @dup_ext_i64( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[DATA:%.*]], i64 1 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: ret [[DOTSPLAT]] +; CHECK-NEXT: [[OUT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: ret [[OUT]] ; %tmp = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 1) %out = call @llvm.aarch64.sve.tbl.nxv2i64( %data, %tmp) @@ -53,11 +58,12 @@ define @dup_ext_i64( %data) #0 { } define @dup_ext_f16( %data) #0 { +; ; CHECK-LABEL: @dup_ext_f16( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[DATA:%.*]], i16 1 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, half [[TMP1]], i32 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: ret [[DOTSPLAT]] +; CHECK-NEXT: [[OUT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: ret [[OUT]] ; %tmp = call @llvm.aarch64.sve.dup.x.nxv8i16(i16 1) %out = call @llvm.aarch64.sve.tbl.nxv8f16( %data, %tmp) @@ -65,11 +71,12 @@ define @dup_ext_f16( %data) #0 { } define @dup_ext_f32( %data) #0 { +; ; CHECK-LABEL: @dup_ext_f32( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[DATA:%.*]], i32 1 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: ret [[DOTSPLAT]] +; CHECK-NEXT: [[OUT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: ret [[OUT]] ; %tmp = call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) %out = call @llvm.aarch64.sve.tbl.nxv4f32( %data, %tmp) @@ -77,11 +84,12 @@ define @dup_ext_f32( %data) #0 { } define @dup_ext_f64( %data) #0 { +; ; CHECK-LABEL: @dup_ext_f64( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[DATA:%.*]], i64 1 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, double [[TMP1]], i32 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: ret [[DOTSPLAT]] +; CHECK-NEXT: [[OUT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: ret [[OUT]] ; %tmp = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 1) %out = call @llvm.aarch64.sve.tbl.nxv2f64( %data, %tmp) diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll index e2a8b27d21d71d..b54222cfecddcf 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll @@ -1,165 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s ; -------------------------------------------------------------------- ; llvm.amdgcn.buffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @buffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret float %data } -; CHECK-LABEL: @buffer_load_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <1 x float> %data } -; CHECK-LABEL: @buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <2 x float> %data } -; CHECK-LABEL: @buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt0 = extractelement <2 x float> %data, i32 0 -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: %ins0 = insertvalue { float, float } undef, float %elt0, 0 -; CHECK-NEXT: %ins1 = insertvalue { float, float } %ins0, float %elt1, 1 -; CHECK-NEXT: ret { float, float } %ins1 define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i32 0 +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0 +; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1 +; CHECK-NEXT: ret { float, float } [[INS1]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt1 = extractelement <4 x float> %data, i32 1 @@ -168,16 +186,17 @@ define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i3 ret { float, float } %ins1 } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt0 = extractelement <3 x float> %data, i32 0 -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 1 -; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: %ins0 = insertvalue { float, float, float } undef, float %elt0, 0 -; CHECK-NEXT: %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1 -; CHECK-NEXT: %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2 -; CHECK-NEXT: ret { float, float, float } %ins2 define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i32 0 +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 1 +; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0 +; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1 +; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2 +; CHECK-NEXT: ret { float, float, float } [[INS2]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt1 = extractelement <4 x float> %data, i32 1 @@ -188,12 +207,14 @@ define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f ret { float, float, float } %ins2 } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] +; CHECK-NEXT: ret <2 x float> [[RET]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt2 = extractelement <4 x float> %data, i32 2 @@ -204,13 +225,14 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i3 ret <2 x float> %ret } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf -; CHECK-NEXT: ret <2 x float> %ret define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] +; CHECK-NEXT: ret <2 x float> [[RET]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt2 = extractelement <4 x float> %data, i32 2 @@ -221,12 +243,14 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i3 ret <2 x float> %ret } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] +; CHECK-NEXT: ret <2 x float> [[RET]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt2 = extractelement <4 x float> %data, i32 2 %ins0 = insertelement <2 x float> poison, float %elt2, i32 0 @@ -236,58 +260,64 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i3 ret <2 x float> %ret } -; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -303,76 +333,83 @@ declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) ; llvm.amdgcn.buffer.load.format ; -------------------------------------------------------------------- -; CHECK-LABEL: @buffer_load_format_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_format_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) ret <1 x float> %data } -; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } ; The initial insertion point is at the extractelement -; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) -; CHECK-NEXT: %1 = shufflevector <2 x float> %tmp, <2 x float> poison, <4 x i32> -; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <2 x double> -; CHECK-NEXT: %tmp2 = extractelement <2 x double> %tmp1, i32 0 -; CHECK-NEXT: ret double %tmp2 define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { - %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %tmp1 = bitcast <4 x float> %tmp to <2 x double> - %tmp2 = extractelement <2 x double> %tmp1, i32 0 - ret double %tmp2 +; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double> +; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i32 0 +; CHECK-NEXT: ret double [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var1 = bitcast <4 x float> %var to <2 x double> + %var2 = extractelement <2 x double> %var1, i32 0 + ret double %var2 } -; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { - %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) -; CHECK-NEXT: %1 = bitcast float %tmp to i32 -; CHECK-NEXT: %tmp2 = trunc i32 %1 to i16 -; CHECK-NEXT: ret i16 %tmp2 define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { - %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %tmp1 = bitcast <4 x float> %tmp to <8 x i16> - %tmp2 = extractelement <8 x i16> %tmp1, i32 0 - ret i16 %tmp2 +; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var1 = bitcast <4 x float> %var to <8 x i16> + %var2 = extractelement <8 x i16> %var1, i32 0 + ret i16 %var2 } declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 @@ -385,228 +422,252 @@ declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i ; llvm.amdgcn.raw.buffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @raw_buffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret float %data } -; CHECK-LABEL: @raw_buffer_load_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @raw_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -; CHECK-LABEL: @raw_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32( -; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float -; CHECK-NEXT: ret float %tmp2 define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x i32> %tmp to <4 x float> - %tmp2 = extractelement <4 x float> %tmp1, i32 0 - ret float %tmp2 +; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32( +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float +; CHECK-NEXT: ret float [[VAR2]] +; + %var = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x i32> %var to <4 x float> + %var2 = extractelement <4 x float> %var1, i32 0 + ret float %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -620,59 +681,65 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16( -; CHECK: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x half> %data, i32 1 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 6 -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x half> define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x half> [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf @@ -683,59 +750,65 @@ declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) #1 declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8( -; CHECK: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8( +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x i8> %data, i32 1 ret i8 %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 3 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8( -; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x i8> define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x i8> [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf @@ -750,179 +823,198 @@ declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) #1 ; llvm.amdgcn.s.buffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @s_buffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @s_buffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) ret float %data } -; CHECK-LABEL: @s_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @s_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf @@ -930,42 +1022,46 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> in ; Do not trim to vec3 s_buffer_load in instcombine, as the load will most likely be widened ; to vec4 anyway during lowering. -; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32( -; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float -; CHECK-NEXT: ret float %tmp2 define float @extract0_bitcast_s_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { - %tmp = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0) - %tmp1 = bitcast <4 x i32> %tmp to <4 x float> - %tmp2 = extractelement <4 x float> %tmp1, i32 0 - ret float %tmp2 +; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32( +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float +; CHECK-NEXT: ret float [[VAR2]] +; + %var = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %var1 = bitcast <4 x i32> %var to <4 x float> + %var2 = extractelement <4 x float> %var1, i32 0 + ret float %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -977,60 +1073,66 @@ declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) #1 declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16( -; CHECK: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 6 -; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x half> define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x half> [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf @@ -1041,59 +1143,65 @@ declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1 declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1 declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8( -; CHECK: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8( +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 3 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8( -; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x i8> define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x i8> [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf @@ -1108,226 +1216,250 @@ declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1 ; llvm.amdgcn.raw.buffer.load.format ; -------------------------------------------------------------------- -; CHECK-LABEL: @raw_buffer_load_format_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @raw_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_format_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret float %data } -; CHECK-LABEL: @raw_buffer_load_format_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @raw_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_format_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -; CHECK-LABEL: @raw_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32( -; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %tmp define float @extract0_bitcast_raw_buffer_load_format_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = extractelement <4 x float> %tmp, i32 0 - ret float %tmp1 +; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[VAR]] +; + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var1 = extractelement <4 x float> %var, i32 0 + ret float %var1 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -1343,228 +1475,252 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i3 ; llvm.amdgcn.struct.buffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @struct_buffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @struct_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret float %data } -; CHECK-LABEL: @struct_buffer_load_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @struct_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -; CHECK-LABEL: @struct_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32( -; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float -; CHECK-NEXT: ret float %tmp2 define float @extract0_bitcast_struct_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x i32> %tmp to <4 x float> - %tmp2 = extractelement <4 x float> %tmp1, i32 0 - ret float %tmp2 +; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32( +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float +; CHECK-NEXT: ret float [[VAR2]] +; + %var = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x i32> %var to <4 x float> + %var2 = extractelement <4 x float> %var1, i32 0 + ret float %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -1578,59 +1734,65 @@ declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16( -; CHECK: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 6 -; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x half> define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x half> [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf @@ -1641,59 +1803,65 @@ declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i3 declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1 declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8( -; CHECK: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8( +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 3 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8( -; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x i8> define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x i8> [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf @@ -1708,217 +1876,240 @@ declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32, ; llvm.amdgcn.struct.buffer.load.format ; -------------------------------------------------------------------- -; CHECK-LABEL: @struct_buffer_load_format_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @struct_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_format_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret float %data } -; CHECK-LABEL: @struct_buffer_load_format_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @struct_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_format_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -; CHECK-LABEL: @struct_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32( -; CHECK: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -1936,209 +2127,231 @@ declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32>, i32, i ; llvm.amdgcn.raw.tbuffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @raw_tbuffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @raw_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @raw_tbuffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret float %data } -; CHECK-LABEL: @raw_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @raw_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { - %tmp = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -2151,40 +2364,44 @@ declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32 declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16( -; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 3 -; CHECK-NEXT: ret half %elt1 define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 3 +; CHECK-NEXT: ret half [[ELT1]] +; %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16( -; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x half> %data, i32 2 -; CHECK-NEXT: ret half %elt1 define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x half> [[DATA]], i32 2 +; CHECK-NEXT: ret half [[ELT1]] +; %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 2 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x half> %data, i32 1 -; CHECK-NEXT: ret half %elt1 define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[DATA]], i32 1 +; CHECK-NEXT: ret half [[ELT1]] +; %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16( -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 0 ret half %elt1 @@ -2199,209 +2416,231 @@ declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, ; llvm.amdgcn.struct.tbuffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @struct_tbuffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @struct_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @struct_tbuffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret float %data } -; CHECK-LABEL: @struct_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @struct_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { - %tmp = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -2418,209 +2657,231 @@ declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i3 ; llvm.amdgcn.tbuffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @tbuffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @tbuffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret float %data } -; CHECK-LABEL: @tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret <2 x float> %data } -; CHECK-LABEL: @tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { - %tmp = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -2638,20 +2899,24 @@ declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, ; llvm.amdgcn.image.sample ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } ; Check that the intrinsic remains unchanged in the presence of TFE or LWE -; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe( -; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0) -; CHECK: ret float %elt0 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe( +; CHECK-NEXT: [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 1, i32 0) +; CHECK-NEXT: [[DATA_VEC:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0 +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA_VEC]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0) %data.vec = extractvalue {<4 x float>,i32} %data, 0 %elt0 = extractelement <4 x float> %data.vec, i32 0 @@ -2659,176 +2924,197 @@ define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, } ; Check that the intrinsic remains unchanged in the presence of TFE or LWE -; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe( -; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0) -; CHECK: ret float %elt0 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe( +; CHECK-NEXT: [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 2, i32 0) +; CHECK-NEXT: [[DATA_VEC:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0 +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA_VEC]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0) %data.vec = extractvalue {<4 x float>,i32} %data, 0 %elt0 = extractelement <4 x float> %data.vec, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32( -; CHECK-NEXT: ret float undef define amdgpu_ps float @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(float %s, float %t, float %r, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32( +; CHECK-NEXT: ret float undef +; %data = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 0, float %s, float %t, float %r, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1darray.f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(float %s, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1darray.f32.f32(i32 1, float [[S:%.*]], float [[SLICE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 2, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 4, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 8, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 9, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %1 = insertelement <2 x float> undef, float %data, i32 0 -; CHECK-NEXT: ret <2 x float> %1 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[DATA]], i32 0 +; CHECK-NEXT: ret <2 x float> [[TMP1]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %1 = insertelement <3 x float> undef, float %data, i32 0 -; CHECK-NEXT: ret <3 x float> %1 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x float> undef, float [[DATA]], i32 0 +; CHECK-NEXT: ret <3 x float> [[TMP1]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x float> [[DATA]], <2 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x float> [[DATA]], <2 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf @@ -2844,10 +3130,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, floa ; llvm.amdgcn.image.sample.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_image_sample_cl_2darray_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.2darray.f32.f32(i32 2, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_image_sample_cl_2darray_v4f32_f32(float %s, float %t, float %slice, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_image_sample_cl_2darray_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cl.2darray.f32.f32(i32 2, float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 1 ret float %elt0 @@ -2859,10 +3146,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32, float, f ; llvm.amdgcn.image.sample.d ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt2_image_sample_d_cube_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32 4, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_image_sample_d_cube_v4f32_f32_f32(float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt2_image_sample_d_cube_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32 4, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[FACE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 2 ret float %elt0 @@ -2874,10 +3162,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32, float, f ; llvm.amdgcn.image.sample.d.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.1d.f32.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cl.1d.f32.f32.f32(i32 8, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 3 ret float %elt0 @@ -2889,10 +3178,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32, float, ; llvm.amdgcn.image.sample.l ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32 4, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32 4, float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32 6, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <2 x float> %data, i32 1 ret float %elt0 @@ -2904,10 +3194,11 @@ declare <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32, float, float, ; llvm.amdgcn.image.sample.b ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.1d.f32.f32.f32(i32 8, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.1d.f32.f32.f32(i32 8, float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 9, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 1 ret float %elt0 @@ -2919,10 +3210,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, flo ; llvm.amdgcn.image.sample.b.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.b.cl.1d.v2f32.f32.f32(i32 12, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.b.cl.1d.v2f32.f32.f32(i32 12, float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 13, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf @@ -2934,10 +3226,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float, ; llvm.amdgcn.image.sample.lz ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.lz.1d.v2f32.f32(i32 10, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.1d.v2f32.f32(i32 10, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf @@ -2949,10 +3242,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i ; llvm.amdgcn.image.sample.cd ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.cd.1d.v3f32.f32.f32(i32 14, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.cd.1d.v3f32.f32.f32(i32 14, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf @@ -2964,67 +3258,74 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, fl ; llvm.amdgcn.image.sample.cd.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x half> %data, i32 3 ret half %elt0 } -; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x half> %data, i32 2 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x half> %data, i32 1 ret half %elt0 } -; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %res = shufflevector <3 x half> %data, <3 x half> poison, <4 x i32> -; CHECK-NEXT: ret <4 x half> %res define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[RES:%.*]] = shufflevector <3 x half> [[DATA]], <3 x half> poison, <4 x i32> +; CHECK-NEXT: ret <4 x half> [[RES]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> ret <4 x half> %res } -; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> poison, <4 x i32> -; CHECK-NEXT: ret <4 x half> %res define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x half> [[DATA]], <2 x half> poison, <4 x i32> +; CHECK-NEXT: ret <4 x half> [[RES]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> ret <4 x half> %res } -; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %res = insertelement <4 x half> poison, half %data, i64 0 -; CHECK-NEXT: ret <4 x half> %res define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[RES:%.*]] = insertelement <4 x half> poison, half [[DATA]], i64 0 +; CHECK-NEXT: ret <4 x half> [[RES]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> ret <4 x half> %res } -; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x half> %data, i32 0 ret half %elt0 @@ -3036,10 +3337,11 @@ declare <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32, float, ; llvm.amdgcn.image.sample.c ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3051,10 +3353,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, ; llvm.amdgcn.image.sample.c.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cl_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.1d.f32.f32(i32 1, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cl_1d_v4f32_f32(float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cl_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cl.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3066,10 +3369,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32, float, floa ; llvm.amdgcn.image.sample.c.d ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3081,10 +3385,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32, float, f ; llvm.amdgcn.image.sample.c.d.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.cl.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3096,10 +3401,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float ; llvm.amdgcn.image.sample.c.l ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_l_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.1d.f32.f32(i32 1, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_l_1d_v4f32_f32(float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_l_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.l.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3111,10 +3417,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float ; llvm.amdgcn.image.sample.c.b ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.1d.f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3126,10 +3433,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32, float, f ; llvm.amdgcn.image.sample.c.b.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.cl.1d.f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3141,10 +3449,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32, float ; llvm.amdgcn.image.sample.c.lz ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_lz_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_lz_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_lz_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.lz.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3156,10 +3465,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, floa ; llvm.amdgcn.image.sample.c.cd ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3171,10 +3481,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float, ; llvm.amdgcn.image.sample.c.cd.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.cl.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3186,10 +3497,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, floa ; llvm.amdgcn.image.sample.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3201,10 +3513,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32, i32, float, <8 ; llvm.amdgcn.image.sample.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_cl_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_cl_o_1d_v4f32_f32(i32 %offset, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_cl_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cl.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3216,10 +3529,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.sample.d.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3231,10 +3545,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32, i32, flo ; llvm.amdgcn.image.sample.d.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3246,10 +3561,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.sample.l.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_l_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_l_o_1d_v4f32_f32(i32 %offset, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_l_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.l.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3261,10 +3577,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.sample.b.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3276,10 +3593,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32, i32, flo ; llvm.amdgcn.image.sample.b.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3291,10 +3609,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.sample.lz.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_lz_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.lz.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_lz_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_lz_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.lz.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3306,10 +3625,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.sample.cd.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cd.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3321,10 +3641,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32, i32, fl ; llvm.amdgcn.image.sample.cd.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cd.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3336,10 +3657,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.sample.c.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3351,10 +3673,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.sample.c.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cl.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3366,10 +3689,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32, i32, floa ; llvm.amdgcn.image.sample.c.d.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3381,10 +3705,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32, i32, f ; llvm.amdgcn.image.sample.c.d.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3396,10 +3721,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32, i32 ; llvm.amdgcn.image.sample.c.l.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_l_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_l_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.l.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3411,10 +3737,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32, i32, float ; llvm.amdgcn.image.sample.c.b.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3426,10 +3753,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32, i32, f ; llvm.amdgcn.image.sample.c.b.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3441,10 +3769,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32, i32 ; llvm.amdgcn.image.sample.c.lz.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.lz.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3456,10 +3785,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32, i32, floa ; llvm.amdgcn.image.sample.c.cd.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3471,10 +3801,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.sample.c.cd.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3488,9 +3819,12 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32, i3 ; Don't handle gather4* -; CHECK-LABEL: @extract_elt0_image_gather4_2d_v4f32_f32( -; CHECK: %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3502,9 +3836,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, < ; llvm.amdgcn.image.gather4.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_cl_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_cl_2d_v4f32_f32(float %s, float %t, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_cl_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3516,9 +3853,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32, float, float ; llvm.amdgcn.image.gather4.l ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_l_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_l_2d_v4f32_f32(float %s, float %t, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_l_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3530,9 +3870,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32, float, float, ; llvm.amdgcn.image.gather4.b ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(float %bias, float %s, float %t, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3544,9 +3887,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32, floa ; llvm.amdgcn.image.gather4.b.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[FACE:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3558,9 +3904,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32, floa ; llvm.amdgcn.image.gather4.lz ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_lz_2d_v4f32_f16( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_lz_2d_v4f32_f16(half %s, half %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_lz_2d_v4f32_f16( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3572,9 +3921,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32, half, half, ; llvm.amdgcn.image.gather4.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3586,9 +3938,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32, i32, float, f ; llvm.amdgcn.image.gather4.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_cl_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_cl_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3600,9 +3955,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32, i32, float ; llvm.amdgcn.image.gather4.l.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_l_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_l_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %lod, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_l_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3614,9 +3972,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.gather4.b.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3628,9 +3989,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32, i32, fl ; llvm.amdgcn.image.gather4.b.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3642,9 +4006,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.gather4.lz.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_lz_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_lz_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3656,9 +4023,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32, i32, float ; llvm.amdgcn.image.gather4.c.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3670,9 +4040,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.gather4.c.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3684,9 +4057,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32, i32, flo ; llvm.amdgcn.image.gather4.c.l.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3698,9 +4074,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32, i32, floa ; llvm.amdgcn.image.gather4.c.b.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3712,9 +4091,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.gather4.c.b.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3726,9 +4108,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32, i3 ; llvm.amdgcn.image.gather4.c.lz.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3740,10 +4125,11 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32, i32, flo ; llvm.amdgcn.image.getlod ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_getlod_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getlod.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_getlod_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_getlod_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.getlod.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3755,10 +4141,11 @@ declare <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32, float, <8 x i32> ; llvm.amdgcn.image.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_load_2dmsaa_v4f32_i32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_load_2dmsaa_v4f32_i32(i32 %s, i32 %t, i32 %sample, <8 x i32> inreg %sampler) #0 { +; CHECK-LABEL: @extract_elt0_image_load_2dmsaa_v4f32_i32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SAMPLE:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3770,10 +4157,11 @@ declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, ; llvm.amdgcn.image.load.mip ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_load_mip_1d_v4f32_i32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32 1, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_load_mip_1d_v4f32_i32(i32 %s, i32 %mip, <8 x i32> inreg %sampler) #0 { +; CHECK-LABEL: @extract_elt0_image_load_mip_1d_v4f32_i32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32 1, i32 [[S:%.*]], i32 [[MIP:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3785,10 +4173,11 @@ declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x ; llvm.amdgcn.image.getresinfo ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_getresinfo_1d_v4f32_i32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %sampler, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_getresinfo_1d_v4f32_i32(i32 %mip, <8 x i32> inreg %sampler) #0 { +; CHECK-LABEL: @extract_elt0_image_getresinfo_1d_v4f32_i32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 [[MIP:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %sampler, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3800,9 +4189,13 @@ declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32, i32, <8 x i3 ; TFE / LWE ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32( -; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32( +; CHECK-NEXT: [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 1) +; CHECK-NEXT: [[RGBA:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0 +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[RGBA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) %rgba = extractvalue { <4 x float>, i32 } %data, 0 %elt0 = extractelement <4 x float> %rgba, i32 0 @@ -3811,10 +4204,11 @@ define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x declare {<4 x float>, i32} @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32, i32, <8 x i32>, i32, i32) #1 -; CHECK: @tfe_check_assert( -; CHECK: %data = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1) -; CHECK-NEXT: ret float %data define amdgpu_hs float @tfe_check_assert() #0 { +; CHECK-LABEL: @tfe_check_assert( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1) +; CHECK-NEXT: ret float [[DATA]] +; %data = call nsz <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1) #2 %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll index 855d396ed63d69..999b574c0affc7 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -1,165 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s ; -------------------------------------------------------------------- ; llvm.amdgcn.buffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @buffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret float %data } -; CHECK-LABEL: @buffer_load_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <1 x float> %data } -; CHECK-LABEL: @buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <2 x float> %data } -; CHECK-LABEL: @buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt0 = extractelement <2 x float> %data, i32 0 -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: %ins0 = insertvalue { float, float } undef, float %elt0, 0 -; CHECK-NEXT: %ins1 = insertvalue { float, float } %ins0, float %elt1, 1 -; CHECK-NEXT: ret { float, float } %ins1 define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i32 0 +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0 +; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1 +; CHECK-NEXT: ret { float, float } [[INS1]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt1 = extractelement <4 x float> %data, i32 1 @@ -168,16 +186,17 @@ define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i3 ret { float, float } %ins1 } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt0 = extractelement <3 x float> %data, i32 0 -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 1 -; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: %ins0 = insertvalue { float, float, float } undef, float %elt0, 0 -; CHECK-NEXT: %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1 -; CHECK-NEXT: %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2 -; CHECK-NEXT: ret { float, float, float } %ins2 define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i32 0 +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 1 +; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0 +; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1 +; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2 +; CHECK-NEXT: ret { float, float, float } [[INS2]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt1 = extractelement <4 x float> %data, i32 1 @@ -188,12 +207,14 @@ define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f ret { float, float, float } %ins2 } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] +; CHECK-NEXT: ret <2 x float> [[RET]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt2 = extractelement <4 x float> %data, i32 2 @@ -204,13 +225,14 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i3 ret <2 x float> %ret } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf -; CHECK-NEXT: ret <2 x float> %ret define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] +; CHECK-NEXT: ret <2 x float> [[RET]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 %elt2 = extractelement <4 x float> %data, i32 2 @@ -221,12 +243,14 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i3 ret <2 x float> %ret } -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %ins1 = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: %ret = fadd <2 x float> %ins1, %shuf define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> undef, <2 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] +; CHECK-NEXT: ret <2 x float> [[RET]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt2 = extractelement <4 x float> %data, i32 2 %ins0 = insertelement <2 x float> undef, float %elt2, i32 0 @@ -236,58 +260,64 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i3 ret <2 x float> %ret } -; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -303,76 +333,83 @@ declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) ; llvm.amdgcn.buffer.load.format ; -------------------------------------------------------------------- -; CHECK-LABEL: @buffer_load_format_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @buffer_load_format_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) ret <1 x float> %data } -; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } ; The initial insertion point is at the extractelement -; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) -; CHECK-NEXT: %1 = shufflevector <2 x float> %tmp, <2 x float> poison, <4 x i32> -; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <2 x double> -; CHECK-NEXT: %tmp2 = extractelement <2 x double> %tmp1, i32 0 -; CHECK-NEXT: ret double %tmp2 define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { - %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %tmp1 = bitcast <4 x float> %tmp to <2 x double> - %tmp2 = extractelement <2 x double> %tmp1, i32 0 - ret double %tmp2 +; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double> +; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i32 0 +; CHECK-NEXT: ret double [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var1 = bitcast <4 x float> %var to <2 x double> + %var2 = extractelement <2 x double> %var1, i32 0 + ret double %var2 } -; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { - %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) -; CHECK-NEXT: %1 = bitcast float %tmp to i32 -; CHECK-NEXT: %tmp2 = trunc i32 %1 to i16 -; CHECK-NEXT: ret i16 %tmp2 define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { - %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %tmp1 = bitcast <4 x float> %tmp to <8 x i16> - %tmp2 = extractelement <8 x i16> %tmp1, i32 0 - ret i16 %tmp2 +; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %var1 = bitcast <4 x float> %var to <8 x i16> + %var2 = extractelement <8 x i16> %var1, i32 0 + ret i16 %var2 } declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 @@ -385,228 +422,252 @@ declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i ; llvm.amdgcn.raw.buffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @raw_buffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret float %data } -; CHECK-LABEL: @raw_buffer_load_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @raw_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -; CHECK-LABEL: @raw_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32( -; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float -; CHECK-NEXT: ret float %tmp2 define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x i32> %tmp to <4 x float> - %tmp2 = extractelement <4 x float> %tmp1, i32 0 - ret float %tmp2 +; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32( +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float +; CHECK-NEXT: ret float [[VAR2]] +; + %var = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x i32> %var to <4 x float> + %var2 = extractelement <4 x float> %var1, i32 0 + ret float %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -620,59 +681,65 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16( -; CHECK: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x half> %data, i32 1 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 6 -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x half> define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x half> [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf @@ -683,59 +750,65 @@ declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) #1 declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8( -; CHECK: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8( +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x i8> %data, i32 1 ret i8 %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 3 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8( -; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x i8> define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x i8> [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf @@ -750,179 +823,198 @@ declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) #1 ; llvm.amdgcn.s.buffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @s_buffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @s_buffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) ret float %data } -; CHECK-LABEL: @s_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @s_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf @@ -930,42 +1022,46 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> in ; Do not trim to vec3 s_buffer_load in instcombine, as the load will most likely be widened ; to vec4 anyway during lowering. -; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32( -; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float -; CHECK-NEXT: ret float %tmp2 define float @extract0_bitcast_s_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { - %tmp = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0) - %tmp1 = bitcast <4 x i32> %tmp to <4 x float> - %tmp2 = extractelement <4 x float> %tmp1, i32 0 - ret float %tmp2 +; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32( +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float +; CHECK-NEXT: ret float [[VAR2]] +; + %var = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0) + %var1 = bitcast <4 x i32> %var to <4 x float> + %var2 = extractelement <4 x float> %var1, i32 0 + ret float %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -977,60 +1073,66 @@ declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) #1 declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16( -; CHECK: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 6 -; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x half> define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x half> [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf @@ -1041,59 +1143,65 @@ declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1 declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1 declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8( -; CHECK: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8( +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <3 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 3 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8( -; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0) -; CHECK-NEXT: ret <2 x i8> define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x i8> [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf @@ -1108,226 +1216,250 @@ declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1 ; llvm.amdgcn.raw.buffer.load.format ; -------------------------------------------------------------------- -; CHECK-LABEL: @raw_buffer_load_format_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @raw_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_format_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret float %data } -; CHECK-LABEL: @raw_buffer_load_format_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @raw_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_format_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -; CHECK-LABEL: @raw_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32( -; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %tmp define float @extract0_bitcast_raw_buffer_load_format_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = extractelement <4 x float> %tmp, i32 0 - ret float %tmp1 +; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[VAR]] +; + %var = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0) + %var1 = extractelement <4 x float> %var, i32 0 + ret float %var1 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -1343,228 +1475,252 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i3 ; llvm.amdgcn.struct.buffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @struct_buffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @struct_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret float %data } -; CHECK-LABEL: @struct_buffer_load_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @struct_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -; CHECK-LABEL: @struct_buffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 12 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 12 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 8 -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 8 +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32( -; CHECK-NEXT: %1 = add i32 %ofs, 4 -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4 +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32( -; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float -; CHECK-NEXT: ret float %tmp2 define float @extract0_bitcast_struct_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x i32> %tmp to <4 x float> - %tmp2 = extractelement <4 x float> %tmp1, i32 0 - ret float %tmp2 +; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32( +; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast i32 [[VAR]] to float +; CHECK-NEXT: ret float [[VAR2]] +; + %var = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x i32> %var to <4 x float> + %var2 = extractelement <4 x float> %var1, i32 0 + ret float %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -1578,59 +1734,65 @@ declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16( -; CHECK: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x half> %data, i32 0 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 2 -; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16( -; CHECK-NEXT: %1 = add i32 %ofs, 6 -; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6 +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x half> define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x half> [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> ret <2 x half> %shuf @@ -1641,59 +1803,65 @@ declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i3 declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1 declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8( -; CHECK: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8( +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x i8> %data, i32 0 ret i8 %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 1 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 1 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8( -; CHECK-NEXT: %1 = add i32 %ofs, 3 -; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0) -; CHECK-NEXT: ret i8 %data define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3 +; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[TMP1]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret i8 [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x i8> %data, i32 3 ret i8 %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8( -; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x i8> define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x i8> [[DATA]] +; %data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x i8> %data, <4 x i8> poison, <2 x i32> ret <2 x i8> %shuf @@ -1708,217 +1876,240 @@ declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32, ; llvm.amdgcn.struct.buffer.load.format ; -------------------------------------------------------------------- -; CHECK-LABEL: @struct_buffer_load_format_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @struct_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_format_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret float %data } -; CHECK-LABEL: @struct_buffer_load_format_v1f32( -; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <1 x float> %data define amdgpu_ps <1 x float> @struct_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_format_v1f32( +; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <1 x float> [[DATA]] +; %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <1 x float> %data } -; CHECK-LABEL: @struct_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32( -; CHECK: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { - %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -1936,209 +2127,231 @@ declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32>, i32, i ; llvm.amdgcn.raw.tbuffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @raw_tbuffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @raw_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @raw_tbuffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret float %data } -; CHECK-LABEL: @raw_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @raw_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt2_elt3_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_raw_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.raw.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_raw_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { - %tmp = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_raw_tbuffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.raw.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -2151,40 +2364,44 @@ declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32 declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1 -; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16( -; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 3 -; CHECK-NEXT: ret half %elt1 define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 3 +; CHECK-NEXT: ret half [[ELT1]] +; %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 3 ret half %elt1 } -; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16( -; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x half> %data, i32 2 -; CHECK-NEXT: ret half %elt1 define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x half> [[DATA]], i32 2 +; CHECK-NEXT: ret half [[ELT1]] +; %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 2 ret half %elt1 } -; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x half> %data, i32 1 -; CHECK-NEXT: ret half %elt1 define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[DATA]], i32 1 +; CHECK-NEXT: ret half [[ELT1]] +; %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 1 ret half %elt1 } -; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16( -; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0) %elt1 = extractelement <4 x half> %data, i32 0 ret half %elt1 @@ -2199,209 +2416,231 @@ declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, ; llvm.amdgcn.struct.tbuffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @struct_tbuffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @struct_tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @struct_tbuffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret float %data } -; CHECK-LABEL: @struct_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @struct_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret <2 x float> %data } -; CHECK-LABEL: @struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt3_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt2_elt3_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt2_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt0_elt1_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @extract_elt1_elt2_struct_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.struct.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_struct_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { - %tmp = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_struct_tbuffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1, i32 inreg %arg2) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[ARG2:%.*]], i32 78, i32 0), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.struct.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 %arg2, i32 78, i32 0), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -2418,209 +2657,231 @@ declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i3 ; llvm.amdgcn.tbuffer.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @tbuffer_load_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @tbuffer_load_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret float %data } -; CHECK-LABEL: @tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret <2 x float> %data } -; CHECK-LABEL: @tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <4 x float> %data define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <4 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) ret <4 x float> %data } -; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32( -; CHECK: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <2 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i32 3 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <4 x float> %data, i32 3 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt0 = extractelement <3 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i32 1 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 1 ret float %elt1 } -; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2 -; CHECK-NEXT: ret float %elt1 define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i32 2 +; CHECK-NEXT: ret float [[ELT1]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %elt1 = extractelement <3 x float> %data, i32 2 ret float %elt1 } -; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> %shuf define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: ret <2 x float> [[SHUF]] +; %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32( -; CHECK-NEXT: %tmp = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 -; CHECK-NEXT: ret i32 %tmp2 define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { - %tmp = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %tmp1 = bitcast <4 x float> %tmp to <4 x i32> - %tmp2 = extractelement <4 x i32> %tmp1, i32 0 - ret i32 %tmp2 +; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32( +; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) +; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 +; CHECK-NEXT: ret i32 [[VAR2]] +; + %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) + %var1 = bitcast <4 x float> %var to <4 x i32> + %var2 = extractelement <4 x i32> %var1, i32 0 + ret i32 %var2 } -; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 -; CHECK-NEXT: ret float %data define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { +; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 %elt0 = extractelement <2 x float> %data, i32 0 ret float %elt0 @@ -2638,20 +2899,24 @@ declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, ; llvm.amdgcn.image.sample ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } ; Check that the intrinsic remains unchanged in the presence of TFE or LWE -; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe( -; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0) -; CHECK: ret float %elt0 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe( +; CHECK-NEXT: [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 1, i32 0) +; CHECK-NEXT: [[DATA_VEC:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0 +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA_VEC]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0) %data.vec = extractvalue {<4 x float>,i32} %data, 0 %elt0 = extractelement <4 x float> %data.vec, i32 0 @@ -2659,176 +2924,197 @@ define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, } ; Check that the intrinsic remains unchanged in the presence of TFE or LWE -; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe( -; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0) -; CHECK: ret float %elt0 define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe( +; CHECK-NEXT: [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float [[VADDR:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 2, i32 0) +; CHECK-NEXT: [[DATA_VEC:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0 +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA_VEC]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0) %data.vec = extractvalue {<4 x float>,i32} %data, 0 %elt0 = extractelement <4 x float> %data.vec, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32( -; CHECK-NEXT: ret float undef define amdgpu_ps float @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(float %s, float %t, float %r, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32( +; CHECK-NEXT: ret float undef +; %data = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 0, float %s, float %t, float %r, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1darray.f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(float %s, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1darray.f32.f32(i32 1, float [[S:%.*]], float [[SLICE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 2, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 4, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 8, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 9, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 } -; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %1 = insertelement <2 x float> undef, float %data, i32 0 -; CHECK-NEXT: ret <2 x float> %1 define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[DATA]], i32 0 +; CHECK-NEXT: ret <2 x float> [[TMP1]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %1 = insertelement <3 x float> undef, float %data, i32 0 -; CHECK-NEXT: ret <3 x float> %1 define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x float> undef, float [[DATA]], i32 0 +; CHECK-NEXT: ret <3 x float> [[TMP1]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x float> [[DATA]], <2 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> %shuf define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x float> [[DATA]], <2 x float> poison, <3 x i32> +; CHECK-NEXT: ret <3 x float> [[SHUF]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf } -; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf @@ -2844,10 +3130,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, floa ; llvm.amdgcn.image.sample.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_image_sample_cl_2darray_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.2darray.f32.f32(i32 2, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_image_sample_cl_2darray_v4f32_f32(float %s, float %t, float %slice, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_image_sample_cl_2darray_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cl.2darray.f32.f32(i32 2, float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 1 ret float %elt0 @@ -2859,10 +3146,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32, float, f ; llvm.amdgcn.image.sample.d ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt2_image_sample_d_cube_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32 4, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt2_image_sample_d_cube_v4f32_f32_f32(float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt2_image_sample_d_cube_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32 4, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S:%.*]], float [[T:%.*]], float [[FACE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 2 ret float %elt0 @@ -2874,10 +3162,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32, float, f ; llvm.amdgcn.image.sample.d.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.1d.f32.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cl.1d.f32.f32.f32(i32 8, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 3 ret float %elt0 @@ -2889,10 +3178,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32, float, ; llvm.amdgcn.image.sample.l ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32 4, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32 4, float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32 6, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <2 x float> %data, i32 1 ret float %elt0 @@ -2904,10 +3194,11 @@ declare <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32, float, float, ; llvm.amdgcn.image.sample.b ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.1d.f32.f32.f32(i32 8, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.1d.f32.f32.f32(i32 8, float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 9, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 1 ret float %elt0 @@ -2919,10 +3210,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, flo ; llvm.amdgcn.image.sample.b.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.b.cl.1d.v2f32.f32.f32(i32 12, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.b.cl.1d.v2f32.f32.f32(i32 12, float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 13, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf @@ -2934,10 +3226,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float, ; llvm.amdgcn.image.sample.lz ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32( -; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.lz.1d.v2f32.f32(i32 10, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <2 x float> %data define amdgpu_ps <2 x float> @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.1d.v2f32.f32(i32 10, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <2 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> ret <2 x float> %shuf @@ -2949,10 +3242,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i ; llvm.amdgcn.image.sample.cd ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.cd.1d.v3f32.f32.f32(i32 14, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret <3 x float> %data define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.cd.1d.v3f32.f32.f32(i32 14, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret <3 x float> [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> ret <3 x float> %shuf @@ -2964,67 +3258,74 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, fl ; llvm.amdgcn.image.sample.cd.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x half> %data, i32 3 ret half %elt0 } -; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x half> %data, i32 2 ret half %elt0 } -; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x half> %data, i32 1 ret half %elt0 } -; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %res = shufflevector <3 x half> %data, <3 x half> poison, <4 x i32> -; CHECK-NEXT: ret <4 x half> %res define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[RES:%.*]] = shufflevector <3 x half> [[DATA]], <3 x half> poison, <4 x i32> +; CHECK-NEXT: ret <4 x half> [[RES]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> ret <4 x half> %res } -; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> poison, <4 x i32> -; CHECK-NEXT: ret <4 x half> %res define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x half> [[DATA]], <2 x half> poison, <4 x i32> +; CHECK-NEXT: ret <4 x half> [[RES]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> ret <4 x half> %res } -; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: %res = insertelement <4 x half> poison, half %data, i64 0 -; CHECK-NEXT: ret <4 x half> %res define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[RES:%.*]] = insertelement <4 x half> poison, half [[DATA]], i64 0 +; CHECK-NEXT: ret <4 x half> [[RES]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %res = shufflevector <4 x half> %data, <4 x half> poison, <4 x i32> ret <4 x half> %res } -; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32( -; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret half %data define amdgpu_ps half @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half [[DATA]] +; %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x half> %data, i32 0 ret half %elt0 @@ -3036,10 +3337,11 @@ declare <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32, float, ; llvm.amdgcn.image.sample.c ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3051,10 +3353,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, ; llvm.amdgcn.image.sample.c.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cl_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.1d.f32.f32(i32 1, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cl_1d_v4f32_f32(float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cl_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cl.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3066,10 +3369,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32, float, floa ; llvm.amdgcn.image.sample.c.d ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3081,10 +3385,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32, float, f ; llvm.amdgcn.image.sample.c.d.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.cl.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3096,10 +3401,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float ; llvm.amdgcn.image.sample.c.l ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_l_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.1d.f32.f32(i32 1, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_l_1d_v4f32_f32(float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_l_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.l.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3111,10 +3417,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float ; llvm.amdgcn.image.sample.c.b ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.1d.f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3126,10 +3433,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32, float, f ; llvm.amdgcn.image.sample.c.b.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.cl.1d.f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3141,10 +3449,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32, float ; llvm.amdgcn.image.sample.c.lz ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_lz_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_lz_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_lz_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.lz.1d.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3156,10 +3465,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, floa ; llvm.amdgcn.image.sample.c.cd ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3171,10 +3481,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float, ; llvm.amdgcn.image.sample.c.cd.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.cl.1d.f32.f32.f32(i32 1, float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3186,10 +3497,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, floa ; llvm.amdgcn.image.sample.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3201,10 +3513,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32, i32, float, <8 ; llvm.amdgcn.image.sample.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_cl_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_cl_o_1d_v4f32_f32(i32 %offset, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_cl_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cl.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3216,10 +3529,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.sample.d.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3231,10 +3545,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32, i32, flo ; llvm.amdgcn.image.sample.d.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.d.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3246,10 +3561,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.sample.l.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_l_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_l_o_1d_v4f32_f32(i32 %offset, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_l_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.l.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3261,10 +3577,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.sample.b.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3276,10 +3593,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32, i32, flo ; llvm.amdgcn.image.sample.b.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.b.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3291,10 +3609,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.sample.lz.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_lz_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.lz.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_lz_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_lz_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.lz.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3306,10 +3625,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.sample.cd.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cd.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3321,10 +3641,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32, i32, fl ; llvm.amdgcn.image.sample.cd.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.cd.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3336,10 +3657,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.sample.c.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3351,10 +3673,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.sample.c.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cl.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3366,10 +3689,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32, i32, floa ; llvm.amdgcn.image.sample.c.d.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3381,10 +3705,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32, i32, f ; llvm.amdgcn.image.sample.c.d.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.d.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3396,10 +3721,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32, i32 ; llvm.amdgcn.image.sample.c.l.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_l_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_l_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.l.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3411,10 +3737,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32, i32, float ; llvm.amdgcn.image.sample.c.b.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3426,10 +3753,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32, i32, f ; llvm.amdgcn.image.sample.c.b.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.b.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3441,10 +3769,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32, i32 ; llvm.amdgcn.image.sample.c.lz.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.lz.o.1d.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3456,10 +3785,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32, i32, floa ; llvm.amdgcn.image.sample.c.cd.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3471,10 +3801,11 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.sample.c.cd.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.c.cd.cl.o.1d.f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[DSDH:%.*]], float [[DSDV:%.*]], float [[S:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3488,9 +3819,12 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32, i3 ; Don't handle gather4* -; CHECK-LABEL: @extract_elt0_image_gather4_2d_v4f32_f32( -; CHECK: %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3502,9 +3836,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, < ; llvm.amdgcn.image.gather4.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_cl_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_cl_2d_v4f32_f32(float %s, float %t, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_cl_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3516,9 +3853,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32, float, float ; llvm.amdgcn.image.gather4.l ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_l_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_l_2d_v4f32_f32(float %s, float %t, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_l_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3530,9 +3870,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32, float, float, ; llvm.amdgcn.image.gather4.b ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(float %bias, float %s, float %t, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[SLICE:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3544,9 +3887,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32, floa ; llvm.amdgcn.image.gather4.b.cl ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[FACE:%.*]], float [[CLAMP:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3558,9 +3904,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32, floa ; llvm.amdgcn.image.gather4.lz ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_lz_2d_v4f32_f16( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_lz_2d_v4f32_f16(half %s, half %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_lz_2d_v4f32_f16( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3572,9 +3921,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32, half, half, ; llvm.amdgcn.image.gather4.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3586,9 +3938,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32, i32, float, f ; llvm.amdgcn.image.gather4.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_cl_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_cl_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3600,9 +3955,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32, i32, float ; llvm.amdgcn.image.gather4.l.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_l_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_l_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %lod, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_l_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3614,9 +3972,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.gather4.b.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3628,9 +3989,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32, i32, fl ; llvm.amdgcn.image.gather4.b.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3642,9 +4006,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.gather4.lz.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_lz_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_lz_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3656,9 +4023,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32, i32, float ; llvm.amdgcn.image.gather4.c.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3670,9 +4040,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32, i32, float, ; llvm.amdgcn.image.gather4.c.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3684,9 +4057,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32, i32, flo ; llvm.amdgcn.image.gather4.c.l.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3698,9 +4074,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32, i32, floa ; llvm.amdgcn.image.gather4.c.b.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3712,9 +4091,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32, i32, ; llvm.amdgcn.image.gather4.c.b.cl.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[BIAS:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], float [[CLAMP:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3726,9 +4108,12 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32, i3 ; llvm.amdgcn.image.gather4.c.lz.o ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32( -; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) define amdgpu_ps float @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], float [[S:%.*]], float [[T:%.*]], <8 x i32> [[GATHER4R:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3740,10 +4125,11 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32, i32, flo ; llvm.amdgcn.image.getlod ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_getlod_1d_v4f32_f32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getlod.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_getlod_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_image_getlod_1d_v4f32_f32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.getlod.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3755,10 +4141,11 @@ declare <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32, float, <8 x i32> ; llvm.amdgcn.image.load ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_load_2dmsaa_v4f32_i32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_load_2dmsaa_v4f32_i32(i32 %s, i32 %t, i32 %sample, <8 x i32> inreg %sampler) #0 { +; CHECK-LABEL: @extract_elt0_image_load_2dmsaa_v4f32_i32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SAMPLE:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3770,10 +4157,11 @@ declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, ; llvm.amdgcn.image.load.mip ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_load_mip_1d_v4f32_i32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32 1, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_load_mip_1d_v4f32_i32(i32 %s, i32 %mip, <8 x i32> inreg %sampler) #0 { +; CHECK-LABEL: @extract_elt0_image_load_mip_1d_v4f32_i32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32 1, i32 [[S:%.*]], i32 [[MIP:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3785,10 +4173,11 @@ declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x ; llvm.amdgcn.image.getresinfo ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_image_getresinfo_1d_v4f32_i32( -; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %sampler, i32 0, i32 0) -; CHECK-NEXT: ret float %data define amdgpu_ps float @extract_elt0_image_getresinfo_1d_v4f32_i32(i32 %mip, <8 x i32> inreg %sampler) #0 { +; CHECK-LABEL: @extract_elt0_image_getresinfo_1d_v4f32_i32( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 [[MIP:%.*]], <8 x i32> [[SAMPLER:%.*]], i32 0, i32 0) +; CHECK-NEXT: ret float [[DATA]] +; %data = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %sampler, i32 0, i32 0) %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 @@ -3800,9 +4189,13 @@ declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32, i32, <8 x i3 ; TFE / LWE ; -------------------------------------------------------------------- -; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32( -; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x i32> inreg %rsrc) #0 { +; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32( +; CHECK-NEXT: [[DATA:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 1) +; CHECK-NEXT: [[RGBA:%.*]] = extractvalue { <4 x float>, i32 } [[DATA]], 0 +; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[RGBA]], i32 0 +; CHECK-NEXT: ret float [[ELT0]] +; %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) %rgba = extractvalue { <4 x float>, i32 } %data, 0 %elt0 = extractelement <4 x float> %rgba, i32 0 @@ -3811,10 +4204,11 @@ define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x declare {<4 x float>, i32} @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32, i32, <8 x i32>, i32, i32) #1 -; CHECK: @tfe_check_assert( -; CHECK: %data = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1) -; CHECK-NEXT: ret float %data define amdgpu_hs float @tfe_check_assert() #0 { +; CHECK-LABEL: @tfe_check_assert( +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1) +; CHECK-NEXT: ret float [[DATA]] +; %data = call nsz <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1) #2 %elt0 = extractelement <4 x float> %data, i32 0 ret float %elt0 diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll index a4c84905b5dacc..071f322b45def7 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll @@ -5,6 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_add_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -20,6 +21,7 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_add_ss_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -32,6 +34,7 @@ define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -51,6 +54,7 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_ss_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -63,6 +67,7 @@ define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x } define float @test_add_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_add_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -82,6 +87,7 @@ define float @test_add_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_add_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -95,6 +101,7 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_add_sd_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -105,6 +112,7 @@ define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -122,6 +130,7 @@ define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_sd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -132,6 +141,7 @@ define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 } define double @test_add_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_add_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -147,6 +157,7 @@ define double @test_add_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_sub_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -162,6 +173,7 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_sub_ss_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -174,6 +186,7 @@ define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -193,6 +206,7 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_ss_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -205,6 +219,7 @@ define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x } define float @test_sub_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_sub_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -224,6 +239,7 @@ define float @test_sub_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_sub_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -237,6 +253,7 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_sub_sd_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -247,6 +264,7 @@ define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -264,6 +282,7 @@ define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_sd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -274,6 +293,7 @@ define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 } define double @test_sub_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_sub_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -289,6 +309,7 @@ define double @test_sub_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_mul_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -304,6 +325,7 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_mul_ss_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -316,6 +338,7 @@ define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -335,6 +358,7 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_ss_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -347,6 +371,7 @@ define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x } define float @test_mul_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_mul_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -366,6 +391,7 @@ define float @test_mul_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_mul_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -379,6 +405,7 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_mul_sd_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -389,6 +416,7 @@ define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -406,6 +434,7 @@ define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_sd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -416,6 +445,7 @@ define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 } define double @test_mul_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_mul_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -431,6 +461,7 @@ define double @test_mul_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_div_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -446,6 +477,7 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_div_ss_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -458,6 +490,7 @@ define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -477,6 +510,7 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_ss_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -489,6 +523,7 @@ define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x } define float @test_div_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_div_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -508,6 +543,7 @@ define float @test_div_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_div_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -521,6 +557,7 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_div_sd_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -531,6 +568,7 @@ define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -548,6 +586,7 @@ define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_sd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -558,6 +597,7 @@ define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 } define double @test_div_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_div_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -573,6 +613,7 @@ define double @test_div_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_max_ss( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -585,6 +626,7 @@ define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_max_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -597,6 +639,7 @@ define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_max_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_max_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -616,6 +659,7 @@ define float @test_max_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_max_sd( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -626,6 +670,7 @@ define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_max_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -636,6 +681,7 @@ define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_max_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_max_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -651,6 +697,7 @@ define double @test_max_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_min_ss( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -663,6 +710,7 @@ define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_min_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -675,6 +723,7 @@ define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_min_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_min_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -694,6 +743,7 @@ define float @test_min_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_min_sd( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -704,6 +754,7 @@ define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_min_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -714,6 +765,7 @@ define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_min_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_min_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -729,6 +781,7 @@ define double @test_min_sd_1(double %a, double %b) { declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32) define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) { +; ; CHECK-LABEL: @test_cmp_ss( ; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret i8 [[TMP1]] @@ -746,6 +799,7 @@ define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) { declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32) define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) { +; ; CHECK-LABEL: @test_cmp_sd( ; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret i8 [[TMP1]] @@ -757,6 +811,7 @@ define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) { } define i64 @test(float %f, double %d) { +; ; CHECK-LABEL: @test( ; CHECK-NEXT: [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0 ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[V03]], i32 4) @@ -837,6 +892,7 @@ declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32) define i64 @test2(float %f, double %d) { +; ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0 ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[V03]], i32 4) @@ -919,6 +975,7 @@ declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) declare float @llvm.fma.f32(float, float, float) #1 define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 @@ -948,6 +1005,7 @@ define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x flo } define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -974,6 +1032,7 @@ define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -995,6 +1054,7 @@ define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> declare double @llvm.fma.f64(double, double, double) #1 define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1020,6 +1080,7 @@ define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x } define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1044,6 +1105,7 @@ define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x doub } define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1061,6 +1123,7 @@ define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x doub } define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 @@ -1090,6 +1153,7 @@ define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl } define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -1116,6 +1180,7 @@ define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1135,6 +1200,7 @@ define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1160,6 +1226,7 @@ define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x } define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1184,6 +1251,7 @@ define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1201,6 +1269,7 @@ define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 @@ -1230,6 +1299,7 @@ define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl } define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -1256,6 +1326,7 @@ define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1275,6 +1346,7 @@ define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1300,6 +1372,7 @@ define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x } define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1324,6 +1397,7 @@ define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1341,6 +1415,7 @@ define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 @@ -1374,6 +1449,7 @@ define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x fl } define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -1404,6 +1480,7 @@ define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1425,6 +1502,7 @@ define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_mask3_vfmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_ss_1_unary_fneg( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1446,6 +1524,7 @@ define float @test_mask3_vfmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, } define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1475,6 +1554,7 @@ define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x } define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1503,6 +1583,7 @@ define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1522,6 +1603,7 @@ define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_mask3_vfmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_sd_1_unary_fneg( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1541,6 +1623,7 @@ define double @test_mask3_vfmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> % } define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[TMP1]] @@ -1576,6 +1659,7 @@ define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x f } define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[TMP1]] @@ -1608,6 +1692,7 @@ define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float } define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1630,6 +1715,7 @@ define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float } define float @test_mask3_vfnmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_ss_1_unary_fneg( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1652,6 +1738,7 @@ define float @test_mask3_vfnmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, } define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]] @@ -1683,6 +1770,7 @@ define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 } define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]] @@ -1713,6 +1801,7 @@ define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x do } define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1733,6 +1822,7 @@ define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x do } define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_sd_1_unary_fneg( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1755,6 +1845,7 @@ define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_si_256( ; CHECK-NEXT: ret <8 x i32> [[A0:%.*]] ; @@ -1763,6 +1854,7 @@ define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]] @@ -1775,6 +1867,7 @@ define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pa } define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_si_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i32> [[TMP1]] @@ -1784,6 +1877,7 @@ define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1797,6 +1891,7 @@ define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passth } define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_si_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] @@ -1806,6 +1901,7 @@ define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1819,6 +1915,7 @@ define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pas } define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_si_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] @@ -1828,6 +1925,7 @@ define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1843,6 +1941,7 @@ define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passt declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_sf_256( ; CHECK-NEXT: ret <8 x float> [[A0:%.*]] ; @@ -1851,6 +1950,7 @@ define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] @@ -1863,6 +1963,7 @@ define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x floa } define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_sf_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x float> [[TMP1]] @@ -1872,6 +1973,7 @@ define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1885,6 +1987,7 @@ define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> % } define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_sf_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP1]] @@ -1894,6 +1997,7 @@ define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1907,6 +2011,7 @@ define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float } define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_sf_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP1]] @@ -1916,6 +2021,7 @@ define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1931,6 +2037,7 @@ define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_di_256( ; CHECK-NEXT: ret <4 x i64> [[A0:%.*]] ; @@ -1939,6 +2046,7 @@ define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> @@ -1953,6 +2061,7 @@ define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pa } define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_di_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -1962,6 +2071,7 @@ define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1977,6 +2087,7 @@ define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passth } define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_di_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -1986,6 +2097,7 @@ define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2001,6 +2113,7 @@ define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pas } define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_di_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -2010,6 +2123,7 @@ define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2027,6 +2141,7 @@ define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passt declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_df_256( ; CHECK-NEXT: ret <4 x double> [[A0:%.*]] ; @@ -2035,6 +2150,7 @@ define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> @@ -2049,6 +2165,7 @@ define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x do } define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_df_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x double> [[TMP1]] @@ -2058,6 +2175,7 @@ define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2073,6 +2191,7 @@ define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double } define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_df_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: ret <4 x double> [[TMP1]] @@ -2082,6 +2201,7 @@ define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2097,6 +2217,7 @@ define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x dou } define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_df_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: ret <4 x double> [[TMP1]] @@ -2106,6 +2227,7 @@ define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2123,6 +2245,7 @@ define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x doubl declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_si_512( ; CHECK-NEXT: ret <16 x i32> [[A0:%.*]] ; @@ -2131,6 +2254,7 @@ define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]] @@ -2143,6 +2267,7 @@ define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> } define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i32> [[TMP1]] @@ -2152,6 +2277,7 @@ define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2165,6 +2291,7 @@ define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pas } define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[TMP1]] @@ -2174,6 +2301,7 @@ define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2187,6 +2315,7 @@ define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> % } define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[TMP1]] @@ -2196,6 +2325,7 @@ define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2211,6 +2341,7 @@ define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pa declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_sf_512( ; CHECK-NEXT: ret <16 x float> [[A0:%.*]] ; @@ -2219,6 +2350,7 @@ define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]] @@ -2231,6 +2363,7 @@ define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x f } define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -2240,6 +2373,7 @@ define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2253,6 +2387,7 @@ define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float } define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -2262,6 +2397,7 @@ define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2275,6 +2411,7 @@ define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x fl } define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -2284,6 +2421,7 @@ define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2299,6 +2437,7 @@ define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x floa declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_di_512( ; CHECK-NEXT: ret <8 x i64> [[A0:%.*]] ; @@ -2307,6 +2446,7 @@ define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]] @@ -2319,6 +2459,7 @@ define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pa } define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i64> [[TMP1]] @@ -2328,6 +2469,7 @@ define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2341,6 +2483,7 @@ define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passth } define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i64> [[TMP1]] @@ -2350,6 +2493,7 @@ define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2363,6 +2507,7 @@ define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pas } define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i64> [[TMP1]] @@ -2372,6 +2517,7 @@ define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2387,6 +2533,7 @@ define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passt declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_df_512( ; CHECK-NEXT: ret <8 x double> [[A0:%.*]] ; @@ -2395,6 +2542,7 @@ define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]] @@ -2407,6 +2555,7 @@ define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x do } define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -2416,6 +2565,7 @@ define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2429,6 +2579,7 @@ define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double } define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -2438,6 +2589,7 @@ define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2451,6 +2603,7 @@ define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x dou } define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -2460,6 +2613,7 @@ define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2475,6 +2629,7 @@ define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x doubl declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_hi_128( ; CHECK-NEXT: ret <8 x i16> [[A0:%.*]] ; @@ -2483,6 +2638,7 @@ define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]] @@ -2495,6 +2651,7 @@ define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pa } define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_hi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i16> [[TMP1]] @@ -2504,6 +2661,7 @@ define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2517,6 +2675,7 @@ define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passth } define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[TMP1]] @@ -2526,6 +2685,7 @@ define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2539,6 +2699,7 @@ define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pas } define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_hi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[TMP1]] @@ -2548,6 +2709,7 @@ define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2563,6 +2725,7 @@ define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passt declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_hi_256( ; CHECK-NEXT: ret <16 x i16> [[A0:%.*]] ; @@ -2571,6 +2734,7 @@ define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]] @@ -2583,6 +2747,7 @@ define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> } define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_hi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i16> [[TMP1]] @@ -2592,6 +2757,7 @@ define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2605,6 +2771,7 @@ define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pas } define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i16> [[TMP1]] @@ -2614,6 +2781,7 @@ define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2627,6 +2795,7 @@ define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> % } define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_hi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i16> [[TMP1]] @@ -2636,6 +2805,7 @@ define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2651,6 +2821,7 @@ define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pa declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_hi_512( ; CHECK-NEXT: ret <32 x i16> [[A0:%.*]] ; @@ -2659,6 +2830,7 @@ define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]] @@ -2671,6 +2843,7 @@ define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> } define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: ret <32 x i16> [[TMP1]] @@ -2680,6 +2853,7 @@ define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2693,6 +2867,7 @@ define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pas } define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i16> [[TMP1]] @@ -2702,6 +2877,7 @@ define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2715,6 +2891,7 @@ define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> % } define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i16> [[TMP1]] @@ -2724,6 +2901,7 @@ define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2739,6 +2917,7 @@ define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pa declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_qi_128( ; CHECK-NEXT: ret <16 x i8> [[A0:%.*]] ; @@ -2747,6 +2926,7 @@ define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]] @@ -2759,6 +2939,7 @@ define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pa } define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_qi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i8> [[TMP1]] @@ -2768,6 +2949,7 @@ define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2781,6 +2963,7 @@ define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passth } define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] @@ -2790,6 +2973,7 @@ define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2803,6 +2987,7 @@ define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pas } define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_qi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] @@ -2812,6 +2997,7 @@ define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2827,6 +3013,7 @@ define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passt declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_qi_256( ; CHECK-NEXT: ret <32 x i8> [[A0:%.*]] ; @@ -2835,6 +3022,7 @@ define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]] @@ -2847,6 +3035,7 @@ define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pa } define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_qi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: ret <32 x i8> [[TMP1]] @@ -2856,6 +3045,7 @@ define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2869,6 +3059,7 @@ define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passth } define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i8> [[TMP1]] @@ -2878,6 +3069,7 @@ define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2891,6 +3083,7 @@ define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pas } define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_qi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i8> [[TMP1]] @@ -2900,6 +3093,7 @@ define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2915,6 +3109,7 @@ define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passt declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_qi_512( ; CHECK-NEXT: ret <64 x i8> [[A0:%.*]] ; @@ -2923,6 +3118,7 @@ define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]] @@ -2935,6 +3131,7 @@ define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pa } define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: ret <64 x i8> [[TMP1]] @@ -2944,6 +3141,7 @@ define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> @@ -2957,6 +3155,7 @@ define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passth } define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: ret <64 x i8> [[TMP1]] @@ -2966,6 +3165,7 @@ define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> @@ -2979,6 +3179,7 @@ define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pas } define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: ret <64 x i8> [[TMP1]] @@ -2988,6 +3189,7 @@ define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> @@ -3003,6 +3205,7 @@ define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passt declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_add_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3012,6 +3215,7 @@ define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_add_ps_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3021,6 +3225,7 @@ define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_add_ps_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3034,6 +3239,7 @@ define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl } define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_add_ps_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3049,6 +3255,7 @@ define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <1 declare <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_add_pd( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3058,6 +3265,7 @@ define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_add_pd_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3067,6 +3275,7 @@ define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_pd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3080,6 +3289,7 @@ define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_pd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3095,6 +3305,7 @@ define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_sub_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3104,6 +3315,7 @@ define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_sub_ps_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3113,6 +3325,7 @@ define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_sub_ps_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3126,6 +3339,7 @@ define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl } define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_sub_ps_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3141,6 +3355,7 @@ define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <1 declare <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_sub_pd( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3150,6 +3365,7 @@ define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_sub_pd_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3159,6 +3375,7 @@ define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_pd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3172,6 +3389,7 @@ define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_pd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3187,6 +3405,7 @@ define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_mul_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3196,6 +3415,7 @@ define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_mul_ps_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3205,6 +3425,7 @@ define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_mul_ps_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3218,6 +3439,7 @@ define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl } define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_mul_ps_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3233,6 +3455,7 @@ define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <1 declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_mul_pd( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3242,6 +3465,7 @@ define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_mul_pd_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3251,6 +3475,7 @@ define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_pd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3264,6 +3489,7 @@ define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_pd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3279,6 +3505,7 @@ define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_div_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3288,6 +3515,7 @@ define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_div_ps_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3297,6 +3525,7 @@ define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_div_ps_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3310,6 +3539,7 @@ define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl } define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_div_ps_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3325,6 +3555,7 @@ define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <1 declare <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_div_pd( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3334,6 +3565,7 @@ define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_div_pd_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3343,6 +3575,7 @@ define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_pd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3356,6 +3589,7 @@ define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_pd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3371,6 +3605,7 @@ define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) define i32 @test_comi_ss_0(float %a, float %b) { +; ; CHECK-LABEL: @test_comi_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i32 0 @@ -3392,6 +3627,7 @@ define i32 @test_comi_ss_0(float %a, float %b) { declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) define i32 @test_comi_sd_0(double %a, double %b) { +; ; CHECK-LABEL: @test_comi_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0 diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll index 7772946751c5c5..dc735e0b490851 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll @@ -5,6 +5,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_add_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -20,6 +21,7 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_add_ss_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -32,6 +34,7 @@ define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -51,6 +54,7 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_ss_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -63,6 +67,7 @@ define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x } define float @test_add_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_add_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -82,6 +87,7 @@ define float @test_add_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_add_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -95,6 +101,7 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_add_sd_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -105,6 +112,7 @@ define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -122,6 +130,7 @@ define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_sd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -132,6 +141,7 @@ define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 } define double @test_add_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_add_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -147,6 +157,7 @@ define double @test_add_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_sub_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -162,6 +173,7 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_sub_ss_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -174,6 +186,7 @@ define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -193,6 +206,7 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_ss_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -205,6 +219,7 @@ define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x } define float @test_sub_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_sub_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -224,6 +239,7 @@ define float @test_sub_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_sub_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -237,6 +253,7 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_sub_sd_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -247,6 +264,7 @@ define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -264,6 +282,7 @@ define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_sd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -274,6 +293,7 @@ define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 } define double @test_sub_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_sub_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -289,6 +309,7 @@ define double @test_sub_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_mul_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -304,6 +325,7 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_mul_ss_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -316,6 +338,7 @@ define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -335,6 +358,7 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_ss_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -347,6 +371,7 @@ define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x } define float @test_mul_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_mul_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -366,6 +391,7 @@ define float @test_mul_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_mul_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -379,6 +405,7 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_mul_sd_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -389,6 +416,7 @@ define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -406,6 +434,7 @@ define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_sd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -416,6 +445,7 @@ define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 } define double @test_mul_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_mul_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -431,6 +461,7 @@ define double @test_mul_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_div_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -446,6 +477,7 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_div_ss_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -458,6 +490,7 @@ define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -477,6 +510,7 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_ss_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -489,6 +523,7 @@ define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x } define float @test_div_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_div_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -508,6 +543,7 @@ define float @test_div_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_div_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -521,6 +557,7 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_div_sd_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -531,6 +568,7 @@ define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -548,6 +586,7 @@ define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_sd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -558,6 +597,7 @@ define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 } define double @test_div_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_div_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -573,6 +613,7 @@ define double @test_div_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_max_ss( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -585,6 +626,7 @@ define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_max_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -597,6 +639,7 @@ define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_max_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_max_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -616,6 +659,7 @@ define float @test_max_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_max_sd( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -626,6 +670,7 @@ define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_max_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -636,6 +681,7 @@ define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_max_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_max_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -651,6 +697,7 @@ define double @test_max_sd_1(double %a, double %b) { declare <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) { +; ; CHECK-LABEL: @test_min_ss( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -663,6 +710,7 @@ define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) { } define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_min_ss_mask( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret <4 x float> [[TMP1]] @@ -675,6 +723,7 @@ define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_min_ss_1(float %a, float %b) { +; ; CHECK-LABEL: @test_min_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -694,6 +743,7 @@ define float @test_min_ss_1(float %a, float %b) { declare <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) { +; ; CHECK-LABEL: @test_min_sd( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -704,6 +754,7 @@ define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) { } define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_min_sd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret <2 x double> [[TMP1]] @@ -714,6 +765,7 @@ define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_min_sd_1(double %a, double %b) { +; ; CHECK-LABEL: @test_min_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -729,6 +781,7 @@ define double @test_min_sd_1(double %a, double %b) { declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32) define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) { +; ; CHECK-LABEL: @test_cmp_ss( ; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret i8 [[TMP1]] @@ -746,6 +799,7 @@ define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) { declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32) define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) { +; ; CHECK-LABEL: @test_cmp_sd( ; CHECK-NEXT: [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4) ; CHECK-NEXT: ret i8 [[TMP1]] @@ -757,6 +811,7 @@ define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) { } define i64 @test(float %f, double %d) { +; ; CHECK-LABEL: @test( ; CHECK-NEXT: [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0 ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[V03]], i32 4) @@ -837,6 +892,7 @@ declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32) define i64 @test2(float %f, double %d) { +; ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0 ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[V03]], i32 4) @@ -919,6 +975,7 @@ declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) declare float @llvm.fma.f32(float, float, float) #1 define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 @@ -948,6 +1005,7 @@ define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x flo } define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -974,6 +1032,7 @@ define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -995,6 +1054,7 @@ define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> declare double @llvm.fma.f64(double, double, double) #1 define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1020,6 +1080,7 @@ define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x } define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1044,6 +1105,7 @@ define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x doub } define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask_vfmadd_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1061,6 +1123,7 @@ define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x doub } define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 @@ -1090,6 +1153,7 @@ define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl } define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -1116,6 +1180,7 @@ define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1135,6 +1200,7 @@ define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1160,6 +1226,7 @@ define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x } define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1184,6 +1251,7 @@ define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_maskz_vfmadd_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1201,6 +1269,7 @@ define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 @@ -1230,6 +1299,7 @@ define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl } define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -1256,6 +1326,7 @@ define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1275,6 +1346,7 @@ define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1300,6 +1372,7 @@ define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x } define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1324,6 +1397,7 @@ define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmadd_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1341,6 +1415,7 @@ define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 @@ -1374,6 +1449,7 @@ define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x fl } define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -1404,6 +1480,7 @@ define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1425,6 +1502,7 @@ define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> } define float @test_mask3_vfmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_ss_1_unary_fneg( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1446,6 +1524,7 @@ define float @test_mask3_vfmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, } define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1475,6 +1554,7 @@ define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x } define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -1503,6 +1583,7 @@ define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1522,6 +1603,7 @@ define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x dou } define double @test_mask3_vfmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfmsub_sd_1_unary_fneg( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1541,6 +1623,7 @@ define double @test_mask3_vfmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> % } define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[TMP1]] @@ -1576,6 +1659,7 @@ define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x f } define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = fneg float [[TMP1]] @@ -1608,6 +1692,7 @@ define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float } define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1630,6 +1715,7 @@ define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float } define float @test_mask3_vfnmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_ss_1_unary_fneg( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -1652,6 +1738,7 @@ define float @test_mask3_vfnmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, } define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]] @@ -1683,6 +1770,7 @@ define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 } define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]] @@ -1713,6 +1801,7 @@ define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x do } define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1733,6 +1822,7 @@ define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x do } define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mask3_vfnmsub_sd_1_unary_fneg( ; CHECK-NEXT: ret double 1.000000e+00 ; @@ -1755,6 +1845,7 @@ define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_si_256( ; CHECK-NEXT: ret <8 x i32> [[A0:%.*]] ; @@ -1763,6 +1854,7 @@ define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]] @@ -1775,6 +1867,7 @@ define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pa } define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_si_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i32> [[TMP1]] @@ -1784,6 +1877,7 @@ define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1797,6 +1891,7 @@ define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passth } define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_si_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] @@ -1806,6 +1901,7 @@ define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1819,6 +1915,7 @@ define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %pas } define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_si_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP1]] @@ -1828,6 +1925,7 @@ define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) { } define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_si_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1843,6 +1941,7 @@ define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passt declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_sf_256( ; CHECK-NEXT: ret <8 x float> [[A0:%.*]] ; @@ -1851,6 +1950,7 @@ define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]] @@ -1863,6 +1963,7 @@ define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x floa } define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_sf_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x float> [[TMP1]] @@ -1872,6 +1973,7 @@ define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1885,6 +1987,7 @@ define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> % } define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_sf_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP1]] @@ -1894,6 +1997,7 @@ define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1907,6 +2011,7 @@ define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float } define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_sf_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP1]] @@ -1916,6 +2021,7 @@ define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) { } define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_sf_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1931,6 +2037,7 @@ define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_di_256( ; CHECK-NEXT: ret <4 x i64> [[A0:%.*]] ; @@ -1939,6 +2046,7 @@ define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> @@ -1953,6 +2061,7 @@ define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pa } define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_di_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -1962,6 +2071,7 @@ define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -1977,6 +2087,7 @@ define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passth } define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_di_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -1986,6 +2097,7 @@ define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2001,6 +2113,7 @@ define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %pas } define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_di_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i64> [[TMP1]] @@ -2010,6 +2123,7 @@ define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) { } define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_di_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2027,6 +2141,7 @@ define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passt declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_df_256( ; CHECK-NEXT: ret <4 x double> [[A0:%.*]] ; @@ -2035,6 +2150,7 @@ define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <4 x i32> @@ -2049,6 +2165,7 @@ define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x do } define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_df_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: ret <4 x double> [[TMP1]] @@ -2058,6 +2175,7 @@ define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2073,6 +2191,7 @@ define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double } define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_df_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: ret <4 x double> [[TMP1]] @@ -2082,6 +2201,7 @@ define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2097,6 +2217,7 @@ define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x dou } define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_df_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: ret <4 x double> [[TMP1]] @@ -2106,6 +2227,7 @@ define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) { } define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_df_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2123,6 +2245,7 @@ define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x doubl declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_si_512( ; CHECK-NEXT: ret <16 x i32> [[A0:%.*]] ; @@ -2131,6 +2254,7 @@ define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]] @@ -2143,6 +2267,7 @@ define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> } define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i32> [[TMP1]] @@ -2152,6 +2277,7 @@ define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2165,6 +2291,7 @@ define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pas } define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[TMP1]] @@ -2174,6 +2301,7 @@ define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2187,6 +2315,7 @@ define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> % } define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[TMP1]] @@ -2196,6 +2325,7 @@ define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) { } define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_si_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2211,6 +2341,7 @@ define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %pa declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_sf_512( ; CHECK-NEXT: ret <16 x float> [[A0:%.*]] ; @@ -2219,6 +2350,7 @@ define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]] @@ -2231,6 +2363,7 @@ define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x f } define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -2240,6 +2373,7 @@ define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2253,6 +2387,7 @@ define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float } define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -2262,6 +2397,7 @@ define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2275,6 +2411,7 @@ define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x fl } define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -2284,6 +2421,7 @@ define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) { } define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_sf_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2299,6 +2437,7 @@ define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x floa declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_di_512( ; CHECK-NEXT: ret <8 x i64> [[A0:%.*]] ; @@ -2307,6 +2446,7 @@ define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]] @@ -2319,6 +2459,7 @@ define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pa } define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i64> [[TMP1]] @@ -2328,6 +2469,7 @@ define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2341,6 +2483,7 @@ define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passth } define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i64> [[TMP1]] @@ -2350,6 +2493,7 @@ define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2363,6 +2507,7 @@ define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %pas } define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i64> [[TMP1]] @@ -2372,6 +2517,7 @@ define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) { } define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_di_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2387,6 +2533,7 @@ define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passt declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_df_512( ; CHECK-NEXT: ret <8 x double> [[A0:%.*]] ; @@ -2395,6 +2542,7 @@ define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]] @@ -2407,6 +2555,7 @@ define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x do } define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -2416,6 +2565,7 @@ define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2429,6 +2579,7 @@ define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double } define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -2438,6 +2589,7 @@ define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2451,6 +2603,7 @@ define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x dou } define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -2460,6 +2613,7 @@ define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) { } define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_df_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2475,6 +2629,7 @@ define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x doubl declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_hi_128( ; CHECK-NEXT: ret <8 x i16> [[A0:%.*]] ; @@ -2483,6 +2638,7 @@ define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]] @@ -2495,6 +2651,7 @@ define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pa } define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_hi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i16> [[TMP1]] @@ -2504,6 +2661,7 @@ define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2517,6 +2675,7 @@ define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passth } define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[TMP1]] @@ -2526,6 +2685,7 @@ define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2539,6 +2699,7 @@ define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %pas } define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_hi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[TMP1]] @@ -2548,6 +2709,7 @@ define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) { } define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_hi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -2563,6 +2725,7 @@ define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passt declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_hi_256( ; CHECK-NEXT: ret <16 x i16> [[A0:%.*]] ; @@ -2571,6 +2734,7 @@ define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]] @@ -2583,6 +2747,7 @@ define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> } define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_hi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i16> [[TMP1]] @@ -2592,6 +2757,7 @@ define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2605,6 +2771,7 @@ define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pas } define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i16> [[TMP1]] @@ -2614,6 +2781,7 @@ define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2627,6 +2795,7 @@ define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> % } define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_hi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i16> [[TMP1]] @@ -2636,6 +2805,7 @@ define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) { } define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_hi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2651,6 +2821,7 @@ define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %pa declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_hi_512( ; CHECK-NEXT: ret <32 x i16> [[A0:%.*]] ; @@ -2659,6 +2830,7 @@ define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]] @@ -2671,6 +2843,7 @@ define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> } define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: ret <32 x i16> [[TMP1]] @@ -2680,6 +2853,7 @@ define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2693,6 +2867,7 @@ define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pas } define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i16> [[TMP1]] @@ -2702,6 +2877,7 @@ define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2715,6 +2891,7 @@ define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> % } define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i16> [[TMP1]] @@ -2724,6 +2901,7 @@ define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) { } define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_hi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2739,6 +2917,7 @@ define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %pa declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_qi_128( ; CHECK-NEXT: ret <16 x i8> [[A0:%.*]] ; @@ -2747,6 +2926,7 @@ define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]] @@ -2759,6 +2939,7 @@ define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pa } define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_qi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: ret <16 x i8> [[TMP1]] @@ -2768,6 +2949,7 @@ define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2781,6 +2963,7 @@ define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passth } define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] @@ -2790,6 +2973,7 @@ define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2803,6 +2987,7 @@ define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %pas } define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_qi_128( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] @@ -2812,6 +2997,7 @@ define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) { } define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_qi_128_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -2827,6 +3013,7 @@ define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passt declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_qi_256( ; CHECK-NEXT: ret <32 x i8> [[A0:%.*]] ; @@ -2835,6 +3022,7 @@ define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]] @@ -2847,6 +3035,7 @@ define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pa } define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_qi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: ret <32 x i8> [[TMP1]] @@ -2856,6 +3045,7 @@ define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2869,6 +3059,7 @@ define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passth } define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i8> [[TMP1]] @@ -2878,6 +3069,7 @@ define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2891,6 +3083,7 @@ define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %pas } define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_qi_256( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: ret <32 x i8> [[TMP1]] @@ -2900,6 +3093,7 @@ define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) { } define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_qi_256_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> @@ -2915,6 +3109,7 @@ define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passt declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { +; ; CHECK-LABEL: @identity_test_permvar_qi_512( ; CHECK-NEXT: ret <64 x i8> [[A0:%.*]] ; @@ -2923,6 +3118,7 @@ define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; ; CHECK-LABEL: @identity_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]] @@ -2935,6 +3131,7 @@ define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pa } define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { +; ; CHECK-LABEL: @zero_test_permvar_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: ret <64 x i8> [[TMP1]] @@ -2944,6 +3141,7 @@ define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; ; CHECK-LABEL: @zero_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> @@ -2957,6 +3155,7 @@ define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passth } define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: ret <64 x i8> [[TMP1]] @@ -2966,6 +3165,7 @@ define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; ; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> @@ -2979,6 +3179,7 @@ define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %pas } define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { +; ; CHECK-LABEL: @undef_test_permvar_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: ret <64 x i8> [[TMP1]] @@ -2988,6 +3189,7 @@ define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) { } define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) { +; ; CHECK-LABEL: @undef_test_permvar_qi_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> poison, <64 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> @@ -3003,6 +3205,7 @@ define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passt declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_add_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3012,6 +3215,7 @@ define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_add_ps_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3021,6 +3225,7 @@ define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_add_ps_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3034,6 +3239,7 @@ define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl } define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_add_ps_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3049,6 +3255,7 @@ define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <1 declare <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_add_pd( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3058,6 +3265,7 @@ define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_add_pd_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3067,6 +3275,7 @@ define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_pd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3080,6 +3289,7 @@ define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_add_pd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3095,6 +3305,7 @@ define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_sub_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3104,6 +3315,7 @@ define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_sub_ps_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3113,6 +3325,7 @@ define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_sub_ps_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3126,6 +3339,7 @@ define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl } define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_sub_ps_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3141,6 +3355,7 @@ define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <1 declare <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_sub_pd( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3150,6 +3365,7 @@ define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_sub_pd_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3159,6 +3375,7 @@ define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_pd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3172,6 +3389,7 @@ define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_sub_pd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3187,6 +3405,7 @@ define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_mul_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3196,6 +3415,7 @@ define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_mul_ps_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3205,6 +3425,7 @@ define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_mul_ps_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3218,6 +3439,7 @@ define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl } define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_mul_ps_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3233,6 +3455,7 @@ define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <1 declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_mul_pd( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3242,6 +3465,7 @@ define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_mul_pd_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3251,6 +3475,7 @@ define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_pd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3264,6 +3489,7 @@ define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_mul_pd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3279,6 +3505,7 @@ define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32) define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_div_ps( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3288,6 +3515,7 @@ define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) { +; ; CHECK-LABEL: @test_div_ps_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <16 x float> [[TMP1]] @@ -3297,6 +3525,7 @@ define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) { } define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_div_ps_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3310,6 +3539,7 @@ define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x fl } define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; ; CHECK-LABEL: @test_div_ps_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -3325,6 +3555,7 @@ define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <1 declare <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double>, <8 x double>, i32) define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_div_pd( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3334,6 +3565,7 @@ define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) { +; ; CHECK-LABEL: @test_div_pd_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: ret <8 x double> [[TMP1]] @@ -3343,6 +3575,7 @@ define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) { } define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_pd_mask( ; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3356,6 +3589,7 @@ define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; ; CHECK-LABEL: @test_div_pd_mask_round( ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -3371,6 +3605,7 @@ define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) define i32 @test_comi_ss_0(float %a, float %b) { +; ; CHECK-LABEL: @test_comi_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i32 0 @@ -3392,6 +3627,7 @@ define i32 @test_comi_ss_0(float %a, float %b) { declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) define i32 @test_comi_sd_0(double %a, double %b) { +; ; CHECK-LABEL: @test_comi_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0 diff --git a/llvm/test/Transforms/InstCombine/X86/x86-fma.ll b/llvm/test/Transforms/InstCombine/X86/x86-fma.ll index cddb1bf9c4e00a..959ee7efa55717 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-fma.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-fma.ll @@ -3,6 +3,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define <4 x float> @test_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; ; CHECK-LABEL: @test_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 @@ -26,6 +27,7 @@ define <4 x float> @test_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> % } define float @test_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; ; CHECK-LABEL: @test_vfmadd_ss_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 @@ -46,6 +48,7 @@ define float @test_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) { } define float @test_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; ; CHECK-LABEL: @test_vfmadd_ss_1( ; CHECK-NEXT: ret float 1.000000e+00 ; @@ -62,6 +65,7 @@ define float @test_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) { } define <2 x double> @test_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; ; CHECK-LABEL: @test_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -81,6 +85,7 @@ define <2 x double> @test_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x doubl } define double @test_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; ; CHECK-LABEL: @test_vfmadd_sd_0( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 @@ -99,6 +104,7 @@ define double @test_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> % } define double @test_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; ; CHECK-LABEL: @test_vfmadd_sd_1( ; CHECK-NEXT: ret double 1.000000e+00 ; diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll index 719389470faca4..5cab62364c24d9 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts-inseltpoison.ll @@ -1,12 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define i16 @test1(float %f) { +; ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[TMP281:%.*]] = fadd float %f, -1.000000e+00 -; CHECK-NEXT: [[TMP373:%.*]] = fmul float [[TMP281]], 5.000000e-01 -; CHECK-NEXT: [[TMP374:%.*]] = insertelement <4 x float> poison, float [[TMP373]], i32 0 -; CHECK-NEXT: [[TMP48:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP374]], <4 x float> ) +; CHECK-NEXT: [[TMP1:%.*]] = fadd float [[F:%.*]], -1.000000e+00 +; CHECK-NEXT: [[TMP2:%.*]] = fmul float [[TMP1]], 5.000000e-01 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP48:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP3]], <4 x float> ) ; CHECK-NEXT: [[TMP59:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP48]], <4 x float> ) ; CHECK-NEXT: [[TMP_UPGRD_1:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[TMP59]]) ; CHECK-NEXT: [[TMP69:%.*]] = trunc i32 [[TMP_UPGRD_1]] to i16 @@ -26,23 +28,24 @@ define i16 @test1(float %f) { } define i64 @test3(float %f, double %d) { +; ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[V00:%.*]] = insertelement <4 x float> poison, float %f, i32 0 -; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[V00]]) -; CHECK-NEXT: [[V10:%.*]] = insertelement <4 x float> poison, float %f, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> [[V10]]) -; CHECK-NEXT: [[V20:%.*]] = insertelement <4 x float> poison, float %f, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[V20]]) -; CHECK-NEXT: [[V30:%.*]] = insertelement <4 x float> poison, float %f, i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> [[V30]]) -; CHECK-NEXT: [[V40:%.*]] = insertelement <2 x double> poison, double %d, i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[V40]]) -; CHECK-NEXT: [[V50:%.*]] = insertelement <2 x double> poison, double %d, i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> [[V50]]) -; CHECK-NEXT: [[V60:%.*]] = insertelement <2 x double> poison, double %d, i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[V60]]) -; CHECK-NEXT: [[V70:%.*]] = insertelement <2 x double> poison, double %d, i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> [[V70]]) +; CHECK-NEXT: [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[V03]]) +; CHECK-NEXT: [[V13:%.*]] = insertelement <4 x float> poison, float [[F]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> [[V13]]) +; CHECK-NEXT: [[V23:%.*]] = insertelement <4 x float> poison, float [[F]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[V23]]) +; CHECK-NEXT: [[V33:%.*]] = insertelement <4 x float> poison, float [[F]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> [[V33]]) +; CHECK-NEXT: [[V41:%.*]] = insertelement <2 x double> poison, double [[D:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[V41]]) +; CHECK-NEXT: [[V51:%.*]] = insertelement <2 x double> poison, double [[D]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> [[V51]]) +; CHECK-NEXT: [[V61:%.*]] = insertelement <2 x double> poison, double [[D]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[V61]]) +; CHECK-NEXT: [[V71:%.*]] = insertelement <2 x double> poison, double [[D]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> [[V71]]) ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]] diff --git a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll index a7b233e5f71bb7..eba990b3c7965f 100644 --- a/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/scalarization-inseltpoison.ll @@ -2,6 +2,7 @@ ; RUN: opt -instcombine -S < %s | FileCheck %s define i32 @extract_load(<4 x i32>* %p) { +; ; CHECK-LABEL: @extract_load( ; CHECK-NEXT: [[X:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x i32> [[X]], i32 1 @@ -13,6 +14,7 @@ define i32 @extract_load(<4 x i32>* %p) { } define double @extract_load_fp(<4 x double>* %p) { +; ; CHECK-LABEL: @extract_load_fp( ; CHECK-NEXT: [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 3 @@ -24,6 +26,7 @@ define double @extract_load_fp(<4 x double>* %p) { } define double @extract_load_volatile(<4 x double>* %p) { +; ; CHECK-LABEL: @extract_load_volatile( ; CHECK-NEXT: [[X:%.*]] = load volatile <4 x double>, <4 x double>* [[P:%.*]], align 32 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 2 @@ -35,6 +38,7 @@ define double @extract_load_volatile(<4 x double>* %p) { } define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) { +; ; CHECK-LABEL: @extract_load_extra_use( ; CHECK-NEXT: [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 8 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 0 @@ -48,6 +52,7 @@ define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) { } define double @extract_load_variable_index(<4 x double>* %p, i32 %y) { +; ; CHECK-LABEL: @extract_load_variable_index( ; CHECK-NEXT: [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 [[Y:%.*]] @@ -59,6 +64,7 @@ define double @extract_load_variable_index(<4 x double>* %p, i32 %y) { } define void @scalarize_phi(i32 * %n, float * %inout) { +; ; CHECK-LABEL: @scalarize_phi( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[T0:%.*]] = load volatile float, float* [[INOUT:%.*]], align 4 @@ -103,6 +109,7 @@ for.end: } define float @extract_element_binop_splat_constant_index(<4 x float> %x) { +; ; CHECK-LABEL: @extract_element_binop_splat_constant_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2 ; CHECK-NEXT: [[R:%.*]] = fadd float [[TMP1]], 0x4002A3D700000000 @@ -114,6 +121,7 @@ define float @extract_element_binop_splat_constant_index(<4 x float> %x) { } define double @extract_element_binop_splat_with_undef_constant_index(<2 x double> %x) { +; ; CHECK-LABEL: @extract_element_binop_splat_with_undef_constant_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0 ; CHECK-NEXT: [[R:%.*]] = fdiv double 4.200000e+01, [[TMP1]] @@ -125,6 +133,7 @@ define double @extract_element_binop_splat_with_undef_constant_index(<2 x double } define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) { +; ; CHECK-LABEL: @extract_element_binop_nonsplat_constant_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 ; CHECK-NEXT: [[R:%.*]] = fmul float [[TMP1]], 4.300000e+01 @@ -136,6 +145,7 @@ define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) { } define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) { +; ; CHECK-LABEL: @extract_element_binop_splat_variable_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = sdiv i8 [[TMP1]], 42 @@ -147,6 +157,7 @@ define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) { } define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i32 %y) { +; ; CHECK-LABEL: @extract_element_binop_splat_with_undef_variable_index( ; CHECK-NEXT: [[B:%.*]] = mul <4 x i8> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]] @@ -158,6 +169,7 @@ define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i3 } define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) { +; ; CHECK-LABEL: @extract_element_binop_nonsplat_variable_index( ; CHECK-NEXT: [[B:%.*]] = lshr <4 x i8> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]] @@ -169,6 +181,7 @@ define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) { } define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) { +; ; CHECK-LABEL: @extract_element_load( ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i32 2 @@ -183,6 +196,7 @@ define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) { } define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0, <4 x float>* %ptr1) { +; ; CHECK-LABEL: @extract_element_multi_Use_load( ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR0:%.*]], align 16 ; CHECK-NEXT: store <4 x float> [[LOAD]], <4 x float>* [[PTR1:%.*]], align 16 @@ -198,6 +212,7 @@ define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0, } define float @extract_element_variable_index(<4 x float> %x, i32 %y) { +; ; CHECK-LABEL: @extract_element_variable_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = fadd float [[TMP1]], 1.000000e+00 @@ -209,6 +224,7 @@ define float @extract_element_variable_index(<4 x float> %x, i32 %y) { } define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) { +; ; CHECK-LABEL: @extelt_binop_insertelt( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 ; CHECK-NEXT: [[E:%.*]] = fmul nnan float [[TMP1]], [[F:%.*]] @@ -224,6 +240,7 @@ define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) { ; FIXME: We should propagate the IR flags including wrapping flags. define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) { +; ; CHECK-LABEL: @extelt_binop_binop_insertelt( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[F:%.*]] @@ -239,6 +256,7 @@ define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) { } define float @extract_element_constant_vector_variable_index(i32 %y) { +; ; CHECK-LABEL: @extract_element_constant_vector_variable_index( ; CHECK-NEXT: [[R:%.*]] = extractelement <4 x float> , i32 [[Y:%.*]] ; CHECK-NEXT: ret float [[R]] @@ -248,6 +266,7 @@ define float @extract_element_constant_vector_variable_index(i32 %y) { } define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) { +; ; CHECK-LABEL: @cheap_to_extract_icmp( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 @@ -262,6 +281,7 @@ define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) { } define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) { +; ; CHECK-LABEL: @cheap_to_extract_fcmp( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00 @@ -276,6 +296,7 @@ define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) { } define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) { +; ; CHECK-LABEL: @extractelt_vector_icmp_constrhs( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 0 ; CHECK-NEXT: [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0 @@ -287,6 +308,7 @@ define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) { } define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) { +; ; CHECK-LABEL: @extractelt_vector_fcmp_constrhs( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 0 ; CHECK-NEXT: [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00 @@ -298,6 +320,7 @@ define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) { } define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) { +; ; CHECK-LABEL: @extractelt_vector_icmp_constrhs_dynidx( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 [[IDX:%.*]] ; CHECK-NEXT: [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0 @@ -309,6 +332,7 @@ define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) { } define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) { +; ; CHECK-LABEL: @extractelt_vector_fcmp_constrhs_dynidx( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 [[IDX:%.*]] ; CHECK-NEXT: [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00 @@ -320,6 +344,7 @@ define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) { } define i1 @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(<2 x float> %arg0, <2 x float> %arg1, <2 x float> %arg2, i32 %idx) { +; ; CHECK-LABEL: @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use( ; CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[ARG1:%.*]], [[ARG2:%.*]] ; CHECK-NEXT: store volatile <2 x float> [[ADD]], <2 x float>* undef, align 8 diff --git a/llvm/test/Transforms/InstCombine/scalarization.ll b/llvm/test/Transforms/InstCombine/scalarization.ll index 08cc534b76f8e2..2bbba7966ad185 100644 --- a/llvm/test/Transforms/InstCombine/scalarization.ll +++ b/llvm/test/Transforms/InstCombine/scalarization.ll @@ -2,6 +2,7 @@ ; RUN: opt -instcombine -S < %s | FileCheck %s define i32 @extract_load(<4 x i32>* %p) { +; ; CHECK-LABEL: @extract_load( ; CHECK-NEXT: [[X:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x i32> [[X]], i32 1 @@ -13,6 +14,7 @@ define i32 @extract_load(<4 x i32>* %p) { } define double @extract_load_fp(<4 x double>* %p) { +; ; CHECK-LABEL: @extract_load_fp( ; CHECK-NEXT: [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 3 @@ -24,6 +26,7 @@ define double @extract_load_fp(<4 x double>* %p) { } define double @extract_load_volatile(<4 x double>* %p) { +; ; CHECK-LABEL: @extract_load_volatile( ; CHECK-NEXT: [[X:%.*]] = load volatile <4 x double>, <4 x double>* [[P:%.*]], align 32 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 2 @@ -35,6 +38,7 @@ define double @extract_load_volatile(<4 x double>* %p) { } define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) { +; ; CHECK-LABEL: @extract_load_extra_use( ; CHECK-NEXT: [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 8 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 0 @@ -48,6 +52,7 @@ define double @extract_load_extra_use(<4 x double>* %p, <4 x double>* %p2) { } define double @extract_load_variable_index(<4 x double>* %p, i32 %y) { +; ; CHECK-LABEL: @extract_load_variable_index( ; CHECK-NEXT: [[X:%.*]] = load <4 x double>, <4 x double>* [[P:%.*]], align 32 ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x double> [[X]], i32 [[Y:%.*]] @@ -59,6 +64,7 @@ define double @extract_load_variable_index(<4 x double>* %p, i32 %y) { } define void @scalarize_phi(i32 * %n, float * %inout) { +; ; CHECK-LABEL: @scalarize_phi( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[T0:%.*]] = load volatile float, float* [[INOUT:%.*]], align 4 @@ -103,6 +109,7 @@ for.end: } define float @extract_element_binop_splat_constant_index(<4 x float> %x) { +; ; CHECK-LABEL: @extract_element_binop_splat_constant_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2 ; CHECK-NEXT: [[R:%.*]] = fadd float [[TMP1]], 0x4002A3D700000000 @@ -114,6 +121,7 @@ define float @extract_element_binop_splat_constant_index(<4 x float> %x) { } define double @extract_element_binop_splat_with_undef_constant_index(<2 x double> %x) { +; ; CHECK-LABEL: @extract_element_binop_splat_with_undef_constant_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0 ; CHECK-NEXT: [[R:%.*]] = fdiv double 4.200000e+01, [[TMP1]] @@ -125,6 +133,7 @@ define double @extract_element_binop_splat_with_undef_constant_index(<2 x double } define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) { +; ; CHECK-LABEL: @extract_element_binop_nonsplat_constant_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1 ; CHECK-NEXT: [[R:%.*]] = fmul float [[TMP1]], 4.300000e+01 @@ -136,6 +145,7 @@ define float @extract_element_binop_nonsplat_constant_index(<2 x float> %x) { } define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) { +; ; CHECK-LABEL: @extract_element_binop_splat_variable_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = sdiv i8 [[TMP1]], 42 @@ -147,6 +157,7 @@ define i8 @extract_element_binop_splat_variable_index(<4 x i8> %x, i32 %y) { } define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i32 %y) { +; ; CHECK-LABEL: @extract_element_binop_splat_with_undef_variable_index( ; CHECK-NEXT: [[B:%.*]] = mul <4 x i8> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]] @@ -158,6 +169,7 @@ define i8 @extract_element_binop_splat_with_undef_variable_index(<4 x i8> %x, i3 } define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) { +; ; CHECK-LABEL: @extract_element_binop_nonsplat_variable_index( ; CHECK-NEXT: [[B:%.*]] = lshr <4 x i8> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i8> [[B]], i32 [[Y:%.*]] @@ -169,6 +181,7 @@ define i8 @extract_element_binop_nonsplat_variable_index(<4 x i8> %x, i32 %y) { } define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) { +; ; CHECK-LABEL: @extract_element_load( ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR:%.*]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[LOAD]], i32 2 @@ -183,6 +196,7 @@ define float @extract_element_load(<4 x float> %x, <4 x float>* %ptr) { } define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0, <4 x float>* %ptr1) { +; ; CHECK-LABEL: @extract_element_multi_Use_load( ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, <4 x float>* [[PTR0:%.*]], align 16 ; CHECK-NEXT: store <4 x float> [[LOAD]], <4 x float>* [[PTR1:%.*]], align 16 @@ -198,6 +212,7 @@ define float @extract_element_multi_Use_load(<4 x float> %x, <4 x float>* %ptr0, } define float @extract_element_variable_index(<4 x float> %x, i32 %y) { +; ; CHECK-LABEL: @extract_element_variable_index( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = fadd float [[TMP1]], 1.000000e+00 @@ -209,6 +224,7 @@ define float @extract_element_variable_index(<4 x float> %x, i32 %y) { } define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) { +; ; CHECK-LABEL: @extelt_binop_insertelt( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 ; CHECK-NEXT: [[E:%.*]] = fmul nnan float [[TMP1]], [[F:%.*]] @@ -222,6 +238,7 @@ define float @extelt_binop_insertelt(<4 x float> %A, <4 x float> %B, float %f) { ; We recurse to find a scalarizable operand. define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) { +; ; CHECK-LABEL: @extelt_binop_binop_insertelt( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[F:%.*]] @@ -237,6 +254,7 @@ define i32 @extelt_binop_binop_insertelt(<4 x i32> %A, <4 x i32> %B, i32 %f) { } define float @extract_element_constant_vector_variable_index(i32 %y) { +; ; CHECK-LABEL: @extract_element_constant_vector_variable_index( ; CHECK-NEXT: [[R:%.*]] = extractelement <4 x float> , i32 [[Y:%.*]] ; CHECK-NEXT: ret float [[R]] @@ -246,6 +264,7 @@ define float @extract_element_constant_vector_variable_index(i32 %y) { } define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) { +; ; CHECK-LABEL: @cheap_to_extract_icmp( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 @@ -260,6 +279,7 @@ define i1 @cheap_to_extract_icmp(<4 x i32> %x, <4 x i1> %y) { } define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) { +; ; CHECK-LABEL: @cheap_to_extract_fcmp( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00 @@ -274,6 +294,7 @@ define i1 @cheap_to_extract_fcmp(<4 x float> %x, <4 x i1> %y) { } define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) { +; ; CHECK-LABEL: @extractelt_vector_icmp_constrhs( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 0 ; CHECK-NEXT: [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0 @@ -285,6 +306,7 @@ define i1 @extractelt_vector_icmp_constrhs(<2 x i32> %arg) { } define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) { +; ; CHECK-LABEL: @extractelt_vector_fcmp_constrhs( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 0 ; CHECK-NEXT: [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00 @@ -296,6 +318,7 @@ define i1 @extractelt_vector_fcmp_constrhs(<2 x float> %arg) { } define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) { +; ; CHECK-LABEL: @extractelt_vector_icmp_constrhs_dynidx( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[ARG:%.*]], i32 [[IDX:%.*]] ; CHECK-NEXT: [[EXT:%.*]] = icmp eq i32 [[TMP1]], 0 @@ -307,6 +330,7 @@ define i1 @extractelt_vector_icmp_constrhs_dynidx(<2 x i32> %arg, i32 %idx) { } define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) { +; ; CHECK-LABEL: @extractelt_vector_fcmp_constrhs_dynidx( ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[ARG:%.*]], i32 [[IDX:%.*]] ; CHECK-NEXT: [[EXT:%.*]] = fcmp oeq float [[TMP1]], 0.000000e+00 @@ -318,6 +342,7 @@ define i1 @extractelt_vector_fcmp_constrhs_dynidx(<2 x float> %arg, i32 %idx) { } define i1 @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use(<2 x float> %arg0, <2 x float> %arg1, <2 x float> %arg2, i32 %idx) { +; ; CHECK-LABEL: @extractelt_vector_fcmp_not_cheap_to_scalarize_multi_use( ; CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[ARG1:%.*]], [[ARG2:%.*]] ; CHECK-NEXT: store volatile <2 x float> [[ADD]], <2 x float>* undef, align 8 diff --git a/llvm/test/Transforms/InstCombine/sincospi.ll b/llvm/test/Transforms/InstCombine/sincospi.ll index 10342c50096179..51a18bc33e28fa 100644 --- a/llvm/test/Transforms/InstCombine/sincospi.ll +++ b/llvm/test/Transforms/InstCombine/sincospi.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -instcombine -S < %s -mtriple=x86_64-apple-macosx10.9 | FileCheck %s --check-prefix=CHECK-FLOAT-IN-VEC ; RUN: opt -instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s ; RUN: opt -instcombine -S < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s @@ -9,93 +10,175 @@ attributes #0 = { readnone nounwind } declare float @__sinpif(float %x) #0 -declare float @__cospif(float %x) #0 +declare float @__cospif(float %x) #0 declare double @__sinpi(double %x) #0 -declare double @__cospi(double %x) #0 +declare double @__cospi(double %x) #0 @var32 = global float 0.0 @var64 = global double 0.0 define float @test_instbased_f32() { - %val = load float, float* @var32 - %sin = call float @__sinpif(float %val) #0 - %cos = call float @__cospif(float %val) #0 - %res = fadd float %sin, %cos - ret float %res -; CHECK-FLOAT-IN-VEC: [[VAL:%[a-z0-9]+]] = load float, float* @var32 -; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call <2 x float> @__sincospif_stret(float [[VAL]]) -; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 0 -; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 1 - -; CHECK: [[VAL:%[a-z0-9]+]] = load float, float* @var32 -; CHECK: [[SINCOS:%[a-z0-9]+]] = call { float, float } @__sincospif_stret(float [[VAL]]) -; CHECK: extractvalue { float, float } [[SINCOS]], 0 -; CHECK: extractvalue { float, float } [[SINCOS]], 1 - -; CHECK-NO-SINCOS: call float @__sinpif -; CHECK-NO-SINCOS: call float @__cospif +; CHECK-FLOAT-IN-VEC-LABEL: @test_instbased_f32( +; CHECK-FLOAT-IN-VEC-NEXT: [[VAL:%.*]] = load float, float* @var32, align 4 +; CHECK-FLOAT-IN-VEC-NEXT: [[SINCOSPI:%.*]] = call <2 x float> @__sincospif_stret(float [[VAL]]) +; CHECK-FLOAT-IN-VEC-NEXT: [[SINPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i32 0 +; CHECK-FLOAT-IN-VEC-NEXT: [[COSPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i32 1 +; CHECK-FLOAT-IN-VEC-NEXT: [[SIN:%.*]] = call float @__sinpif(float [[VAL]]) #[[ATTR0:[0-9]+]] +; CHECK-FLOAT-IN-VEC-NEXT: [[COS:%.*]] = call float @__cospif(float [[VAL]]) #[[ATTR0]] +; CHECK-FLOAT-IN-VEC-NEXT: [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]] +; CHECK-FLOAT-IN-VEC-NEXT: ret float [[RES]] +; +; CHECK-LABEL: @test_instbased_f32( +; CHECK-NEXT: [[VAL:%.*]] = load float, float* @var32, align 4 +; CHECK-NEXT: [[SINCOSPI:%.*]] = call { float, float } @__sincospif_stret(float [[VAL]]) +; CHECK-NEXT: [[SINPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 0 +; CHECK-NEXT: [[COSPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 1 +; CHECK-NEXT: [[SIN:%.*]] = call float @__sinpif(float [[VAL]]) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: [[COS:%.*]] = call float @__cospif(float [[VAL]]) #[[ATTR0]] +; CHECK-NEXT: [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]] +; CHECK-NEXT: ret float [[RES]] +; +; CHECK-NO-SINCOS-LABEL: @test_instbased_f32( +; CHECK-NO-SINCOS-NEXT: [[VAL:%.*]] = load float, float* @var32, align 4 +; CHECK-NO-SINCOS-NEXT: [[SIN:%.*]] = call float @__sinpif(float [[VAL]]) #[[ATTR0:[0-9]+]] +; CHECK-NO-SINCOS-NEXT: [[COS:%.*]] = call float @__cospif(float [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-NEXT: [[RES:%.*]] = fadd float [[SIN]], [[COS]] +; CHECK-NO-SINCOS-NEXT: ret float [[RES]] +; + %val = load float, float* @var32 + %sin = call float @__sinpif(float %val) #0 + %cos = call float @__cospif(float %val) #0 + %res = fadd float %sin, %cos + ret float %res + + } define float @test_constant_f32() { - %sin = call float @__sinpif(float 1.0) #0 - %cos = call float @__cospif(float 1.0) #0 - %res = fadd float %sin, %cos - ret float %res -; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call <2 x float> @__sincospif_stret(float 1.000000e+00) -; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 0 -; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 1 - -; CHECK: [[SINCOS:%[a-z0-9]+]] = call { float, float } @__sincospif_stret(float 1.000000e+00) -; CHECK: extractvalue { float, float } [[SINCOS]], 0 -; CHECK: extractvalue { float, float } [[SINCOS]], 1 - -; CHECK-NO-SINCOS: call float @__sinpif -; CHECK-NO-SINCOS: call float @__cospif +; CHECK-FLOAT-IN-VEC-LABEL: @test_constant_f32( +; CHECK-FLOAT-IN-VEC-NEXT: [[SINCOSPI:%.*]] = call <2 x float> @__sincospif_stret(float 1.000000e+00) +; CHECK-FLOAT-IN-VEC-NEXT: [[SINPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i32 0 +; CHECK-FLOAT-IN-VEC-NEXT: [[COSPI:%.*]] = extractelement <2 x float> [[SINCOSPI]], i32 1 +; CHECK-FLOAT-IN-VEC-NEXT: [[SIN:%.*]] = call float @__sinpif(float 1.000000e+00) #[[ATTR0]] +; CHECK-FLOAT-IN-VEC-NEXT: [[COS:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]] +; CHECK-FLOAT-IN-VEC-NEXT: [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]] +; CHECK-FLOAT-IN-VEC-NEXT: ret float [[RES]] +; +; CHECK-LABEL: @test_constant_f32( +; CHECK-NEXT: [[SINCOSPI:%.*]] = call { float, float } @__sincospif_stret(float 1.000000e+00) +; CHECK-NEXT: [[SINPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 0 +; CHECK-NEXT: [[COSPI:%.*]] = extractvalue { float, float } [[SINCOSPI]], 1 +; CHECK-NEXT: [[SIN:%.*]] = call float @__sinpif(float 1.000000e+00) #[[ATTR0]] +; CHECK-NEXT: [[COS:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]] +; CHECK-NEXT: [[RES:%.*]] = fadd float [[SINPI]], [[COSPI]] +; CHECK-NEXT: ret float [[RES]] +; +; CHECK-NO-SINCOS-LABEL: @test_constant_f32( +; CHECK-NO-SINCOS-NEXT: [[SIN:%.*]] = call float @__sinpif(float 1.000000e+00) #[[ATTR0]] +; CHECK-NO-SINCOS-NEXT: [[COS:%.*]] = call float @__cospif(float 1.000000e+00) #[[ATTR0]] +; CHECK-NO-SINCOS-NEXT: [[RES:%.*]] = fadd float [[SIN]], [[COS]] +; CHECK-NO-SINCOS-NEXT: ret float [[RES]] +; + %sin = call float @__sinpif(float 1.0) #0 + %cos = call float @__cospif(float 1.0) #0 + %res = fadd float %sin, %cos + ret float %res + + } define double @test_instbased_f64() { - %val = load double, double* @var64 - %sin = call double @__sinpi(double %val) #0 - %cos = call double @__cospi(double %val) #0 - %res = fadd double %sin, %cos - ret double %res -; CHECK-FLOAT-IN-VEC: [[VAL:%[a-z0-9]+]] = load double, double* @var64 -; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double [[VAL]]) -; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 0 -; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 1 - -; CHECK: [[VAL:%[a-z0-9]+]] = load double, double* @var64 -; CHECK: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double [[VAL]]) -; CHECK: extractvalue { double, double } [[SINCOS]], 0 -; CHECK: extractvalue { double, double } [[SINCOS]], 1 - -; CHECK-NO-SINCOS: call double @__sinpi -; CHECK-NO-SINCOS: call double @__cospi +; CHECK-FLOAT-IN-VEC-LABEL: @test_instbased_f64( +; CHECK-FLOAT-IN-VEC-NEXT: [[VAL:%.*]] = load double, double* @var64, align 8 +; CHECK-FLOAT-IN-VEC-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]]) +; CHECK-FLOAT-IN-VEC-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 +; CHECK-FLOAT-IN-VEC-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 +; CHECK-FLOAT-IN-VEC-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]] +; CHECK-FLOAT-IN-VEC-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-FLOAT-IN-VEC-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] +; CHECK-FLOAT-IN-VEC-NEXT: ret double [[RES]] +; +; CHECK-LABEL: @test_instbased_f64( +; CHECK-NEXT: [[VAL:%.*]] = load double, double* @var64, align 8 +; CHECK-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]]) +; CHECK-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 +; CHECK-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 +; CHECK-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]] +; CHECK-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] +; CHECK-NEXT: ret double [[RES]] +; +; CHECK-NO-SINCOS-LABEL: @test_instbased_f64( +; CHECK-NO-SINCOS-NEXT: [[VAL:%.*]] = load double, double* @var64, align 8 +; CHECK-NO-SINCOS-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-NEXT: [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]] +; CHECK-NO-SINCOS-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] +; CHECK-NO-SINCOS-NEXT: ret double [[RES]] +; + %val = load double, double* @var64 + %sin = call double @__sinpi(double %val) #0 + %cos = call double @__cospi(double %val) #0 + %res = fadd double %sin, %cos + ret double %res + + } define double @test_constant_f64() { - %sin = call double @__sinpi(double 1.0) #0 - %cos = call double @__cospi(double 1.0) #0 - %res = fadd double %sin, %cos - ret double %res -; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double 1.000000e+00) -; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 0 -; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 1 - -; CHECK: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double 1.000000e+00) -; CHECK: extractvalue { double, double } [[SINCOS]], 0 -; CHECK: extractvalue { double, double } [[SINCOS]], 1 - -; CHECK-NO-SINCOS: call double @__sinpi -; CHECK-NO-SINCOS: call double @__cospi +; CHECK-FLOAT-IN-VEC-LABEL: @test_constant_f64( +; CHECK-FLOAT-IN-VEC-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double 1.000000e+00) +; CHECK-FLOAT-IN-VEC-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 +; CHECK-FLOAT-IN-VEC-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 +; CHECK-FLOAT-IN-VEC-NEXT: [[SIN:%.*]] = call double @__sinpi(double 1.000000e+00) #[[ATTR0]] +; CHECK-FLOAT-IN-VEC-NEXT: [[COS:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]] +; CHECK-FLOAT-IN-VEC-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] +; CHECK-FLOAT-IN-VEC-NEXT: ret double [[RES]] +; +; CHECK-LABEL: @test_constant_f64( +; CHECK-NEXT: [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double 1.000000e+00) +; CHECK-NEXT: [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0 +; CHECK-NEXT: [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1 +; CHECK-NEXT: [[SIN:%.*]] = call double @__sinpi(double 1.000000e+00) #[[ATTR0]] +; CHECK-NEXT: [[COS:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]] +; CHECK-NEXT: [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]] +; CHECK-NEXT: ret double [[RES]] +; +; CHECK-NO-SINCOS-LABEL: @test_constant_f64( +; CHECK-NO-SINCOS-NEXT: [[SIN:%.*]] = call double @__sinpi(double 1.000000e+00) #[[ATTR0]] +; CHECK-NO-SINCOS-NEXT: [[COS:%.*]] = call double @__cospi(double 1.000000e+00) #[[ATTR0]] +; CHECK-NO-SINCOS-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] +; CHECK-NO-SINCOS-NEXT: ret double [[RES]] +; + %sin = call double @__sinpi(double 1.0) #0 + %cos = call double @__cospi(double 1.0) #0 + %res = fadd double %sin, %cos + ret double %res + + } define double @test_fptr(double (double)* %fptr, double %p1) { - %sin = call double @__sinpi(double %p1) #0 - %cos = call double %fptr(double %p1) - %res = fadd double %sin, %cos - ret double %res -; CHECK-LABEL: @test_fptr -; CHECK: __sinpi +; CHECK-FLOAT-IN-VEC-LABEL: @test_fptr( +; CHECK-FLOAT-IN-VEC-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[P1:%.*]]) #[[ATTR0]] +; CHECK-FLOAT-IN-VEC-NEXT: [[COS:%.*]] = call double [[FPTR:%.*]](double [[P1]]) +; CHECK-FLOAT-IN-VEC-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] +; CHECK-FLOAT-IN-VEC-NEXT: ret double [[RES]] +; +; CHECK-LABEL: @test_fptr( +; CHECK-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[P1:%.*]]) #[[ATTR0]] +; CHECK-NEXT: [[COS:%.*]] = call double [[FPTR:%.*]](double [[P1]]) +; CHECK-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] +; CHECK-NEXT: ret double [[RES]] +; +; CHECK-NO-SINCOS-LABEL: @test_fptr( +; CHECK-NO-SINCOS-NEXT: [[SIN:%.*]] = call double @__sinpi(double [[P1:%.*]]) #[[ATTR0]] +; CHECK-NO-SINCOS-NEXT: [[COS:%.*]] = call double [[FPTR:%.*]](double [[P1]]) +; CHECK-NO-SINCOS-NEXT: [[RES:%.*]] = fadd double [[SIN]], [[COS]] +; CHECK-NO-SINCOS-NEXT: ret double [[RES]] +; + %sin = call double @__sinpi(double %p1) #0 + %cos = call double %fptr(double %p1) + %res = fadd double %sin, %cos + ret double %res } diff --git a/llvm/test/Transforms/InstCombine/vec_extract_2elts.ll b/llvm/test/Transforms/InstCombine/vec_extract_2elts.ll index 5972340d60a91d..f359dfe00a3b8a 100644 --- a/llvm/test/Transforms/InstCombine/vec_extract_2elts.ll +++ b/llvm/test/Transforms/InstCombine/vec_extract_2elts.ll @@ -1,12 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s define void @test(<4 x i32> %v, i64 *%r1, i64 *%r2) { -;CHECK: %1 = extractelement <4 x i32> %v, i32 0 -;CHECK: %2 = zext i32 %1 to i64 - %1 = zext <4 x i32> %v to <4 x i64> - %2 = extractelement <4 x i64> %1, i32 0 - store i64 %2, i64 *%r1 - store i64 %2, i64 *%r2 - ret void +; CHECK-LABEL: @test( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[V:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: store i64 [[TMP2]], i64* [[R1:%.*]], align 4 +; CHECK-NEXT: store i64 [[TMP2]], i64* [[R2:%.*]], align 4 +; CHECK-NEXT: ret void +; + %1 = zext <4 x i32> %v to <4 x i64> + %2 = extractelement <4 x i64> %1, i32 0 + store i64 %2, i64 *%r1 + store i64 %2, i64 *%r2 + ret void } diff --git a/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll index e8c96d15bdb167..a792c6156240d3 100644 --- a/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll @@ -1,9 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s define void @f(i64 %val, i32 %limit, i32 *%ptr) { -; CHECK-LABEL: @f -; CHECK: %0 = trunc i64 %val to i32 -; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ] +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[END:%.*]] = icmp ult i32 [[TMP1]], [[LIMIT:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 10 +; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP3]] +; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5]] = add i32 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[END]], label [[LOOP]], label [[RET:%.*]] +; CHECK: ret: +; CHECK-NEXT: ret void +; entry: %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0 %vector = shufflevector <16 x i64> %tempvector, <16 x i64> poison, <16 x i32> zeroinitializer @@ -27,9 +41,22 @@ ret: } define void @copy(i64 %val, i32 %limit, i32 *%ptr) { -; CHECK-LABEL: @copy -; CHECK: %0 = trunc i64 %val to i32 -; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ] +; CHECK-LABEL: @copy( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[END:%.*]] = icmp ult i32 [[TMP1]], [[LIMIT:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 10 +; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP3]] +; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5]] = add i32 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[END]], label [[LOOP]], label [[RET:%.*]] +; CHECK: ret: +; CHECK-NEXT: ret void +; entry: %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0 %vector = shufflevector <16 x i64> %tempvector, <16 x i64> poison, <16 x i32> zeroinitializer @@ -54,9 +81,27 @@ ret: } define void @nocopy(i64 %val, i32 %limit, i32 *%ptr) { -; CHECK-LABEL: @nocopy -; CHECK-NOT: phi i32 -; CHECK: phi <16 x i32> [ %3, %entry ], [ %inc, %loop ] +; CHECK-LABEL: @nocopy( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP2]], +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP4:%.*]] = phi <16 x i32> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ELT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 0 +; CHECK-NEXT: [[ELTCOPY:%.*]] = extractelement <16 x i32> [[TMP4]], i32 1 +; CHECK-NEXT: [[END:%.*]] = icmp ult i32 [[ELT]], [[LIMIT:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[ELTCOPY]], 10 +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[ELT]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP6]] +; CHECK-NEXT: store i32 [[TMP5]], i32* [[TMP7]], align 4 +; CHECK-NEXT: [[INC]] = add <16 x i32> [[TMP4]], +; CHECK-NEXT: br i1 [[END]], label [[LOOP]], label [[RET:%.*]] +; CHECK: ret: +; CHECK-NEXT: ret void +; entry: %tempvector = insertelement <16 x i64> poison, i64 %val, i32 0 %vector = shufflevector <16 x i64> %tempvector, <16 x i64> poison, <16 x i32> zeroinitializer @@ -81,8 +126,23 @@ ret: } define i1 @g(<3 x i32> %input_2) { -; CHECK-LABEL: @g -; CHECK: extractelement <3 x i32> %input_2, i32 0 +; CHECK-LABEL: @g( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i32> [[INPUT_2:%.*]], i32 0 +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], -1 +; CHECK-NEXT: [[SUB44_ELT:%.*]] = sub i32 0, [[TMP2]] +; CHECK-NEXT: [[TMP4]] = sdiv i32 [[TMP1]], [[SUB44_ELT]] +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.end: +; CHECK-NEXT: [[TOBOOL313:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[TOBOOL313]] +; entry: br label %for.cond @@ -91,7 +151,6 @@ for.cond: %input_1.addr.1 = phi <3 x i32> [ undef, %entry ], [ %dec43, %for.body ] br i1 undef, label %for.end, label %for.body -; CHECK-NOT: extractelement <3 x i32> %{{.*}}, i32 0 for.body: %dec43 = add <3 x i32> %input_1.addr.1, %sub44 = sub <3 x i32> , %dec43 diff --git a/llvm/test/Transforms/InstCombine/vec_phi_extract.ll b/llvm/test/Transforms/InstCombine/vec_phi_extract.ll index 15eb94aad6943a..3aacc0f8203526 100644 --- a/llvm/test/Transforms/InstCombine/vec_phi_extract.ll +++ b/llvm/test/Transforms/InstCombine/vec_phi_extract.ll @@ -1,9 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s define void @f(i64 %val, i32 %limit, i32 *%ptr) { -; CHECK-LABEL: @f -; CHECK: %0 = trunc i64 %val to i32 -; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ] +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[END:%.*]] = icmp ult i32 [[TMP1]], [[LIMIT:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 10 +; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP3]] +; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5]] = add i32 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[END]], label [[LOOP]], label [[RET:%.*]] +; CHECK: ret: +; CHECK-NEXT: ret void +; entry: %tempvector = insertelement <16 x i64> undef, i64 %val, i32 0 %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer @@ -27,9 +41,22 @@ ret: } define void @copy(i64 %val, i32 %limit, i32 *%ptr) { -; CHECK-LABEL: @copy -; CHECK: %0 = trunc i64 %val to i32 -; CHECK: %1 = phi i32 [ %0, %entry ], [ {{.*}}, %loop ] +; CHECK-LABEL: @copy( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[END:%.*]] = icmp ult i32 [[TMP1]], [[LIMIT:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 10 +; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP3]] +; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5]] = add i32 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[END]], label [[LOOP]], label [[RET:%.*]] +; CHECK: ret: +; CHECK-NEXT: ret void +; entry: %tempvector = insertelement <16 x i64> undef, i64 %val, i32 0 %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer @@ -54,9 +81,27 @@ ret: } define void @nocopy(i64 %val, i32 %limit, i32 *%ptr) { -; CHECK-LABEL: @nocopy -; CHECK-NOT: phi i32 -; CHECK: phi <16 x i32> [ %3, %entry ], [ %inc, %loop ] +; CHECK-LABEL: @nocopy( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP2]], +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP4:%.*]] = phi <16 x i32> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ELT:%.*]] = extractelement <16 x i32> [[TMP4]], i32 0 +; CHECK-NEXT: [[ELTCOPY:%.*]] = extractelement <16 x i32> [[TMP4]], i32 1 +; CHECK-NEXT: [[END:%.*]] = icmp ult i32 [[ELT]], [[LIMIT:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[ELTCOPY]], 10 +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[ELT]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP6]] +; CHECK-NEXT: store i32 [[TMP5]], i32* [[TMP7]], align 4 +; CHECK-NEXT: [[INC]] = add <16 x i32> [[TMP4]], +; CHECK-NEXT: br i1 [[END]], label [[LOOP]], label [[RET:%.*]] +; CHECK: ret: +; CHECK-NEXT: ret void +; entry: %tempvector = insertelement <16 x i64> undef, i64 %val, i32 0 %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer @@ -81,8 +126,23 @@ ret: } define i1 @g(<3 x i32> %input_2) { -; CHECK-LABEL: @g -; CHECK: extractelement <3 x i32> %input_2, i32 0 +; CHECK-LABEL: @g( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i32> [[INPUT_2:%.*]], i32 0 +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], -1 +; CHECK-NEXT: [[SUB44_ELT:%.*]] = sub i32 0, [[TMP2]] +; CHECK-NEXT: [[TMP4]] = sdiv i32 [[TMP1]], [[SUB44_ELT]] +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.end: +; CHECK-NEXT: [[TOBOOL313:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[TOBOOL313]] +; entry: br label %for.cond @@ -91,7 +151,6 @@ for.cond: %input_1.addr.1 = phi <3 x i32> [ undef, %entry ], [ %dec43, %for.body ] br i1 undef, label %for.end, label %for.body -; CHECK-NOT: extractelement <3 x i32> %{{.*}}, i32 0 for.body: %dec43 = add <3 x i32> %input_1.addr.1, %sub44 = sub <3 x i32> , %dec43 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll index 6c82f9dc4c6381..90f249fa18eab0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts ; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST ; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s @@ -8,72 +9,79 @@ target triple = "aarch64--linux-gnu" ; This test checks that we correctly compute the scalarized operands for a ; user-specified vectorization factor when interleaving is disabled. We use the ; "optsize" attribute to disable all interleaving calculations. A cost of 4 -; for %tmp4 indicates that we would scalarize it's operand (%tmp3), giving -; %tmp4 a lower scalarization overhead. +; for %var4 indicates that we would scalarize it's operand (%var3), giving +; %var4 a lower scalarization overhead. ; ; COST-LABEL: predicated_udiv_scalarized_operand -; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3 +; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3 ; +; +define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize { ; CHECK-LABEL: @predicated_udiv_scalarized_operand( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]] -; CHECK: [[PRED_UDIV_IF]]: +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK: pred.udiv.if: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], %x +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], [[X:%.*]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP6]], [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 -; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]] -; CHECK: [[PRED_UDIV_CONTINUE]]: -; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ poison, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ] +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]] +; CHECK: pred.udiv.continue: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ] ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 -; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]] -; CHECK: [[PRED_UDIV_IF1]]: +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]] +; CHECK: pred.udiv.if1: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP11]], %x +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP11]], [[X]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP13]], [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1 -; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]] -; CHECK: [[PRED_UDIV_CONTINUE2]]: -; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ] +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] +; CHECK: pred.udiv.continue2: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP17]]) +; CHECK-NEXT: ret i64 [[TMP19]] ; -define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize { entry: br label %for.body for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] - %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ] - %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i - %tmp2 = load i64, i64* %tmp0, align 4 - %cond0 = icmp sgt i64 %tmp2, 0 + %r = phi i64 [ 0, %entry ], [ %var6, %for.inc ] + %var0 = getelementptr inbounds i64, i64* %a, i64 %i + %var2 = load i64, i64* %var0, align 4 + %cond0 = icmp sgt i64 %var2, 0 br i1 %cond0, label %if.then, label %for.inc if.then: - %tmp3 = add nsw i64 %tmp2, %x - %tmp4 = udiv i64 %tmp2, %tmp3 + %var3 = add nsw i64 %var2, %x + %var4 = udiv i64 %var2, %var3 br label %for.inc for.inc: - %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then] - %tmp6 = add i64 %r, %tmp5 + %var5 = phi i64 [ %var2, %for.body ], [ %var4, %if.then] + %var6 = add i64 %r, %var5 %i.next = add nuw nsw i64 %i, 1 %cond1 = icmp slt i64 %i.next, 100 br i1 %cond1, label %for.body, label %for.end for.end: - %tmp7 = phi i64 [ %tmp6, %for.inc ] - ret i64 %tmp7 + %var7 = phi i64 [ %var6, %for.inc ] + ret i64 %var7 } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll index 98acd07df33849..a2912dc58c1dd5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -1,18 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -scalable-vectorization=on -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -o - | FileCheck %s define void @cond_inv_load_i32i32i16(i32* noalias nocapture %a, i32* noalias nocapture readonly %cond, i16* noalias nocapture readonly %inv, i64 %n) #0 { -; CHECK-LABEL: @cond_inv_load_i32i32i16 -; CHECK: vector.ph: -; CHECK: %[[INVINS:.*]] = insertelement poison, i16* %inv, i32 0 -; CHECK: %[[INVSPLAT:.*]] = shufflevector %[[INVINS]], poison, zeroinitializer -; CHECK: vector.body: -; CHECK: %[[GEPCOND:.*]] = getelementptr inbounds i32, i32* %cond, i64 %index -; CHECK-NEXT: %[[GEPCOND2:.*]] = bitcast i32* %[[GEPCOND]] to * -; CHECK-NEXT: %[[CONDVALS:.*]] = load , * %[[GEPCOND2]], align 4 -; CHECK-NEXT: %[[MASK:.*]] = icmp ne %[[CONDVALS]], -; CHECK-NEXT: %[[GATHERLOAD:.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( %[[INVSPLAT]], i32 2, %[[MASK]], undef) -; CHECK-NEXT: %[[GATHERLOAD2:.*]] = sext %[[GATHERLOAD]] to -; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32( %[[GATHERLOAD2]] +; CHECK-LABEL: @cond_inv_load_i32i32i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16* [[INV:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[BROADCAST_SPLAT]], i32 2, [[TMP6]], undef) +; CHECK-NEXT: [[TMP7:%.*]] = sext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP7]], * [[TMP9]], i32 4, [[TMP6]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[I_07]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[INV]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_07]] +; CHECK-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -40,17 +84,59 @@ exit: ; preds = %for.inc } define void @cond_inv_load_f64f64f64(double* noalias nocapture %a, double* noalias nocapture readonly %cond, double* noalias nocapture readonly %inv, i64 %n) #0 { -; CHECK-LABEL: @cond_inv_load_f64f64f64 -; CHECK: vector.ph: -; CHECK: %[[INVINS:.*]] = insertelement poison, double* %inv, i32 0 -; CHECK: %[[INVSPLAT:.*]] = shufflevector %[[INVINS]], poison, zeroinitializer -; CHECK: vector.body: -; CHECK: %[[GEPCOND:.*]] = getelementptr inbounds double, double* %cond, i64 %index -; CHECK-NEXT: %[[GEPCOND2:.*]] = bitcast double* %[[GEPCOND]] to * -; CHECK-NEXT: %[[CONDVALS:.*]] = load , * %[[GEPCOND2]], align 8 -; CHECK-NEXT: %[[MASK:.*]] = fcmp ogt %[[CONDVALS]], -; CHECK-NEXT: %[[GATHERLOAD:.*]] = call @llvm.masked.gather.nxv4f64.nxv4p0f64( %[[INVSPLAT]], i32 8, %[[MASK]], undef) -; CHECK: call void @llvm.masked.store.nxv4f64.p0nxv4f64( %[[GATHERLOAD]] +; CHECK-LABEL: @cond_inv_load_f64f64f64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double* [[INV:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[COND:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, double 4.000000e-01, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f64.nxv4p0f64( [[BROADCAST_SPLAT]], i32 8, [[TMP6]], undef) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0nxv4f64( [[WIDE_MASKED_GATHER]], * [[TMP8]], i32 8, [[TMP6]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[I_08]] +; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[TMP12]], 4.000000e-01 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP13:%.*]] = load double, double* [[INV]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]] +; CHECK-NEXT: store double [[TMP13]], double* [[ARRAYIDX2]], align 8 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -77,17 +163,68 @@ exit: ; preds = %for.inc } define void @invariant_load_cond(i32* noalias nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %cond, i64 %n) #0 { -; CHECK-LABEL: @invariant_load_cond -; CHECK: vector.body -; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42 -; CHECK-NEXT: %[[SPLATINS:.*]] = insertelement poison, i32* %[[GEP]], i32 0 -; CHECK-NEXT: %[[SPLAT:.*]] = shufflevector %[[SPLATINS]], poison, zeroinitializer -; CHECK: %[[LOAD:.*]] = load , * -; CHECK-NEXT: %[[ICMP:.*]] = icmp ne %[[LOAD]], zeroinitializer -; CHECK: %[[MASKED_LOAD:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* %[[BITCAST:.*]], i32 4, %[[ICMP]], poison) -; CHECK-NEXT: %[[MASKED_GATHER:.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( %[[SPLAT]], i32 4, %[[ICMP]], undef) -; CHECK-NEXT: %[[ADD:.*]] = add nsw %[[MASKED_GATHER]], %[[MASKED_LOAD]] -; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32( %[[ADD]], * %[[BITCAST1:.*]], i32 4, %[[ICMP]]) +; CHECK-LABEL: @invariant_load_cond( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 42 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP9]], i32 4, [[TMP7]], poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[DOTSPLAT]], i32 4, [[TMP7]], undef) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP10]], * [[TMP12]], i32 4, [[TMP7]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP16]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 42 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll index e5bbcd881163fb..70b665d82765ba 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -1,12 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -scalable-vectorization=preferred -force-target-instruction-cost=1 -o - | FileCheck %s define void @gather_nxv4i32_ind64(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 { -; CHECK-LABEL: @gather_nxv4i32_ind64 -; CHECK: vector.body: -; CHECK: %[[IND:.*]] = load , * -; CHECK: %[[PTRS:.*]] = getelementptr inbounds float, float* %a, %[[IND]] -; CHECK: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[PTRS]] -; CHECK: store %[[GLOAD]], * +; CHECK-LABEL: @gather_nxv4i32_ind64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], [[WIDE_LOAD]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to * +; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], * [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[TMP13]], float* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -30,13 +72,55 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo ; additional 'sext' in the for.body loop exposes additional code paths ; during vectorisation. define void @scatter_nxv4i32_ind32(float* noalias nocapture %a, i32* noalias nocapture readonly %b, float* noalias nocapture readonly %c, i64 %n) #0 { -; CHECK-LABEL: @scatter_nxv4i32_ind32 -; CHECK: vector.body: -; CHECK: %[[VALS:.*]] = load -; CHECK: %[[IND:.*]] = load , * %7, align 4 -; CHECK: %[[EXTIND:.*]] = sext %[[IND]] to -; CHECK: %[[PTRS:.*]] = getelementptr inbounds float, float* %a, %[[EXTIND]] -; CHECK: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( %[[VALS]], %[[PTRS]] +; CHECK-LABEL: @scatter_nxv4i32_ind32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to * +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , * [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A:%.*]], [[TMP8]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( [[WIDE_LOAD]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDXPROM4]] +; CHECK-NEXT: store float [[TMP13]], float* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -58,14 +142,54 @@ for.cond.cleanup: ; preds = %for.body, %entry } define void @scatter_inv_nxv4i32(i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 { -; CHECK-LABEL: @scatter_inv_nxv4i32 -; CHECK: vector.ph: -; CHECK: %[[INS:.*]] = insertelement poison, i32* %inv, i32 0 -; CHECK: %[[PTRSPLAT:.*]] = shufflevector %[[INS]], poison, zeroinitializer -; CHECK: vector.body: -; CHECK: %[[VALS:.*]] = load , * %5, align 4 -; CHECK: %[[MASK:.*]] = icmp ne %[[VALS]], -; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32({{.*}}, %[[PTRSPLAT]], i32 4, %[[MASK]]) +; CHECK-LABEL: @scatter_inv_nxv4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[INV:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), [[BROADCAST_SPLAT]], i32 4, [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP10]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 3, i32* [[INV]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -90,14 +214,57 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo } define void @gather_inv_nxv4i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %inv, i64 %n) #0 { -; CHECK-LABEL: @gather_inv_nxv4i32 -; CHECK: vector.ph: -; CHECK: %[[INS:.*]] = insertelement poison, i32* %inv, i32 0 -; CHECK: %[[PTRSPLAT:.*]] = shufflevector %[[INS]], poison, zeroinitializer -; CHECK: vector.body: -; CHECK: %[[VALS:.*]] = load , * %5, align 4 -; CHECK: %[[MASK:.*]] = icmp sgt %[[VALS]], -; CHECK: %{{.*}} = call @llvm.masked.gather.nxv4i32.nxv4p0i32( %[[PTRSPLAT]], i32 4, %[[MASK]] +; CHECK-LABEL: @gather_inv_nxv4i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[INV:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], i32 4, [[TMP6]], undef) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP4]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_GATHER]], * [[TMP7]], i32 4, [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[TMP11]], 3 +; CHECK-NEXT: br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[INV]], align 4 +; CHECK-NEXT: store i32 [[TMP12]], i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -125,16 +292,69 @@ for.cond.cleanup: ; preds = %for.inc, %entry define void @gather_nxv4i32_ind64_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { -; CHECK-LABEL: @gather_nxv4i32_ind64_stride2 -; CHECK: vector.body: -; CHECK: %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK-DAG: %[[STEP:.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-DAG: %[[IDXSPLATINS:.*]] = insertelement poison, i64 %[[IDX]], i32 0 -; CHECK-DAG: %[[IDXSPLAT:.*]] = shufflevector %[[IDXSPLATINS]], poison, zeroinitializer -; CHECK: %[[ADD:.*]] = add %[[IDXSPLAT]], %[[STEP]] -; CHECK: %[[MUL:.*]] = shl %[[ADD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) -; CHECK: %[[PTRS:.*]] = getelementptr inbounds float, float* %b, %[[MUL]] -; CHECK: call @llvm.masked.gather.nxv4f32.nxv4p0f32( %[[PTRS]] +; CHECK-LABEL: @gather_nxv4i32_ind64_stride2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement poison, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector [[DOTSPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add [[DOTSPLAT3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement poison, i64 [[TMP7]], i32 0 +; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector [[DOTSPLATINSERT4]], poison, zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = add [[DOTSPLAT5]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = add [[DOTSPLAT3]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = shl [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = shl [[TMP9]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[B:%.*]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B]], [[TMP11]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to * +; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], * [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to * +; CHECK-NEXT: store [[WIDE_MASKED_GATHER6]], * [[TMP20]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP21]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDVARS_IV_STRIDE2:%.*]] = shl i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_STRIDE2]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[TMP24]], float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll index e5af52c5784781..19c8b1525bc91d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -scalable-vectorization=on -force-target-instruction-cost=1 -dce -instcombine < %s -S | FileCheck %s target triple = "aarch64-linux-gnu" @@ -11,15 +12,59 @@ target triple = "aarch64-linux-gnu" define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 { ; CHECK-LABEL: @cond_ind64( -; CHECK: vector.body: -; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK: %[[STEPVEC:.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: %[[TMP1:.*]] = insertelement poison, i64 %[[INDEX]], i32 0 -; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector %[[TMP1]], poison, zeroinitializer -; CHECK-NEXT: %[[VECIND:.*]] = add %[[IDXSPLT]], %[[STEPVEC]] -; CHECK-NEXT: %[[MASK:.*]] = trunc %[[VECIND]] to -; CHECK: %[[LOAD:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* %{{.*}}, i32 4, %[[MASK]], poison) -; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32( %[[LOAD]], * %{{.*}}, i32 4, %[[MASK]]) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add [[DOTSPLAT2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc [[TMP5]] to +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to * +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP8]], i32 4, [[TMP6]], poison) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to * +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_LOAD]], * [[TMP10]], i32 4, [[TMP6]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[I_08]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[AND]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_08]] +; CHECK-NEXT: store i32 [[TMP14]], i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index a11ac0e87b6284..3aca292ebd3c73 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -scalable-vectorization=on -dce -instcombine -S < %s | FileCheck %s ; Ensure that we can vectorize loops such as: @@ -14,9 +15,23 @@ define void @widen_ptr_phi_unrolled(i32* noalias nocapture %a, i32* noalias nocapture %b, i32* nocapture readonly %c, i64 %n) #0 { ; CHECK-LABEL: @widen_ptr_phi_unrolled( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 [[TMP4]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ %c, %vector.ph ], [ %[[PTR_IND:.*]], %vector.body ] -; CHECK: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[C]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 ; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP5]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i64() @@ -30,11 +45,64 @@ define void @widen_ptr_phi_unrolled(i32* noalias nocapture %a, i32* noalias noca ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[VECTOR_GEP4]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, [[TMP9]], i64 1 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, [[TMP12]], i64 1 -; CHECK-NEXT: {{%.*}} = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP9]], -; CHECK-NEXT: {{%.*}} = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP12]], -; CHECK-NEXT: {{%.*}} = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], -; CHECK-NEXT: {{%.*}} = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP14]], -; CHECK: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP15:%.*]] = add nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP16:%.*]] = add nsw [[WIDE_MASKED_GATHER5]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to * +; CHECK-NEXT: store [[TMP15]], * [[TMP18]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw i32 [[TMP19]], 2 +; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to * +; CHECK-NEXT: store [[TMP16]], * [[TMP23]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = add nsw [[WIDE_MASKED_GATHER6]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP25:%.*]] = add nsw [[WIDE_MASKED_GATHER7]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to * +; CHECK-NEXT: store [[TMP24]], * [[TMP27]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP29:%.*]] = shl nuw nsw i32 [[TMP28]], 2 +; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP29]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP26]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to * +; CHECK-NEXT: store [[TMP25]], * [[TMP32]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP34:%.*]] = shl nuw nsw i64 [[TMP33]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP7]] +; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[C]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[PTR_014:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[PTR_014]], i64 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[PTR_014]], align 4 +; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* [[PTR_014]], i64 2 +; CHECK-NEXT: [[TMP37:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP36]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_013]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP37]], 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_013]] +; CHECK-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -73,27 +141,69 @@ for.exit: ; preds = %for.body define void @widen_2ptrs_phi_unrolled(i32* noalias nocapture %dst, i32* noalias nocapture readonly %src, i64 %n) #0 { ; CHECK-LABEL: @widen_2ptrs_phi_unrolled( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK-NEXT: %[[LGEP1:.*]] = getelementptr i32, i32* %src, i64 %[[IDX]] -; CHECK-NEXT: %[[SGEP1:.*]] = getelementptr i32, i32* %dst, i64 %[[IDX]] -; CHECK-NEXT: %[[LPTR1:.*]] = bitcast i32* %[[LGEP1]] to * -; CHECK-NEXT: %{{.*}} = load , * %[[LPTR1]], align 4 -; CHECK-NEXT: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: %[[TMP1:.*]] = shl nuw nsw i32 %[[VSCALE1]], 2 -; CHECK-NEXT: %[[TMP2:.*]] = zext i32 %[[TMP1]] to i64 -; CHECK-NEXT: %[[LGEP2:.*]] = getelementptr i32, i32* %[[LGEP1]], i64 %[[TMP2]] -; CHECK-NEXT: %[[LPTR2:.*]] = bitcast i32* %[[LGEP2]] to * -; CHECK-NEXT: %{{.*}} = load , * %[[LPTR2]], align 4 -; CHECK: %[[SPTR1:.*]] = bitcast i32* %[[SGEP1]] to * -; CHECK-NEXT: store %{{.*}}, * %[[SPTR1]], align 4 -; CHECK-NEXT: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: %[[TMP3:.*]] = shl nuw nsw i32 %[[VSCALE2]], 2 -; CHECK-NEXT: %[[TMP4:.*]] = zext i32 %[[TMP3]] to i64 -; CHECK-NEXT: %[[SGEP2:.*]] = getelementptr i32, i32* %[[SGEP1]], i64 %[[TMP4]] -; CHECK-NEXT: %[[SPTR2:.*]] = bitcast i32* %[[SGEP2]] to * -; CHECK-NEXT: store %{{.*}}, * %[[SPTR2]], align 4 - +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to * +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load , * [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shl nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = shl nsw [[WIDE_LOAD7]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[NEXT_GEP5]] to * +; CHECK-NEXT: store [[TMP10]], * [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i32 [[TMP13]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[NEXT_GEP5]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to * +; CHECK-NEXT: store [[TMP11]], * [[TMP17]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32* [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[S_010:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[D_09:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[S_010]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP21]], 1 +; CHECK-NEXT: store i32 [[MUL]], i32* [[D_09]], align 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[D_09]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* [[S_010]], i64 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -124,15 +234,29 @@ for.cond.cleanup: ; preds = %for.body define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 { ; CHECK-LABEL: @pointer_iv_mixed( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32*, i32** [[B:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ %a, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ] -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 0, i32 0), %vector.ph ], [ [[TMP9:%.*]], %vector.body ] +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 0, i32 0), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[TMP6]] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32*, i32** %b, i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32*, i32** [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[BC:%.*]] = bitcast [[TMP7]] to *> ; CHECK-NEXT: [[TMP8:%.*]] = extractelement *> [[BC]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP8]], align 8 @@ -142,36 +266,72 @@ define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 { ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}} +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP5]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[TMP9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32** [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P:%.*]] = phi i32* [ [[VAR3:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[Q:%.*]] = phi i32** [ [[VAR4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR0:%.*]] = phi i32 [ [[VAR2:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR1:%.*]] = load i32, i32* [[P]], align 8 +; CHECK-NEXT: [[VAR2]] = add i32 [[VAR1]], [[VAR0]] +; CHECK-NEXT: store i32* [[P]], i32** [[Q]], align 8 +; CHECK-NEXT: [[VAR3]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; CHECK-NEXT: [[VAR4]] = getelementptr inbounds i32*, i32** [[Q]], i64 1 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR2]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[VAR5]] +; entry: br label %for.body for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ] - %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ] - %tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ] - %tmp1 = load i32, i32* %p, align 8 - %tmp2 = add i32 %tmp1, %tmp0 + %p = phi i32* [ %var3, %for.body ], [ %a, %entry ] + %q = phi i32** [ %var4, %for.body ], [ %b, %entry ] + %var0 = phi i32 [ %var2, %for.body ], [ 0, %entry ] + %var1 = load i32, i32* %p, align 8 + %var2 = add i32 %var1, %var0 store i32* %p, i32** %q, align 8 - %tmp3 = getelementptr inbounds i32, i32* %p, i32 1 - %tmp4 = getelementptr inbounds i32*, i32** %q, i32 1 + %var3 = getelementptr inbounds i32, i32* %p, i32 1 + %var4 = getelementptr inbounds i32*, i32** %q, i32 1 %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end, !llvm.loop !6 for.end: - %tmp5 = phi i32 [ %tmp2, %for.body ] - ret i32 %tmp5 + %var5 = phi i32 [ %var2, %for.body ] + ret i32 %var5 } define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(i16* %ptr) #0 { ; CHECK-LABEL: @phi_used_in_vector_compare_and_scalar_indvar_update_and_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i16, i16* [[PTR:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i16* [ %ptr, %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ] -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i16* [ [[PTR]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() @@ -183,9 +343,32 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(i16* ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{.*}} +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i16, i16* [[POINTER_PHI]], i64 [[TMP3]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[IV_PTR:%.*]] = phi i16* [ [[INCDEC_IV_PTR:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i16* [[IV_PTR]], null +; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[IF_END]], label [[IF_END_SINK_SPLIT:%.*]] +; CHECK: if.end.sink.split: +; CHECK-NEXT: store i16 0, i16* [[IV_PTR]], align 2 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[INCDEC_IV_PTR]] = getelementptr inbounds i16, i16* [[IV_PTR]], i64 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp ult i64 [[IV]], 1023 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll index 5c7bd79d751fe8..012106fa50e52d 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -dce \ ; RUN: -instcombine -force-vector-width=2 < %s | FileCheck %s ; @@ -7,15 +8,32 @@ ; Check that the addresses for a scalarized memory access is not extracted ; from a vector register. define i32 @foo(i32* nocapture %A) { -;CHECK-LABEL: @foo( -;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -;CHECK: %0 = shl nsw i64 %index, 2 -;CHECK: %1 = shl i64 %index, 2 -;CHECK: %2 = or i64 %1, 4 -;CHECK: %3 = getelementptr inbounds i32, i32* %A, i64 %0 -;CHECK: %4 = getelementptr inbounds i32, i32* %A, i64 %2 -;CHECK: store i32 4, i32* %3, align 4 -;CHECK: store i32 4, i32* %4, align 4 +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] +; CHECK-NEXT: store i32 4, i32* [[TMP3]], align 4 +; CHECK-NEXT: store i32 4, i32* [[TMP4]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 undef +; entry: br label %for.body @@ -37,20 +55,37 @@ for.end: ; Check that a load of address is scalarized. define i32 @foo1(i32* nocapture noalias %A, i32** nocapture %PtrPtr) { -;CHECK-LABEL: @foo1( -;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -;CHECK: %0 = or i64 %index, 1 -;CHECK: %1 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %index -;CHECK: %2 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %0 -;CHECK: %3 = load i32*, i32** %1, align 8 -;CHECK: %4 = load i32*, i32** %2, align 8 -;CHECK: %5 = load i32, i32* %3, align 4 -;CHECK: %6 = load i32, i32* %4, align 4 -;CHECK: %7 = insertelement <2 x i32> poison, i32 %5, i32 0 -;CHECK: %8 = insertelement <2 x i32> %7, i32 %6, i32 1 -;CHECK: %9 = getelementptr inbounds i32, i32* %A, i64 %index -;CHECK: %10 = bitcast i32* %9 to <2 x i32>* -;CHECK: store <2 x i32> %8, <2 x i32>* %10, align 4 +; CHECK-LABEL: @foo1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32*, i32** [[PTRPTR:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** [[PTRPTR]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[TMP1]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP2]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP8]], <2 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 undef +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll index e9a4541c01df1c..8966f7caec98a6 100644 --- a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll +++ b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll @@ -1,12 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -dce -instcombine -S -force-vector-width=4 < %s 2>%t | FileCheck %s define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @inv_store_last_lane -; CHECK: vector.body: -; CHECK: store <4 x i32> %[[VEC_VAL:.*]], < -; CHECK: middle.block: -; CHECK: %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3 - +; CHECK-LABEL: @inv_store_last_lane( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP7]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[INV:%.*]], i64 42 +; CHECK-NEXT: store i32 [[MUL_LCSSA]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -28,12 +65,46 @@ exit: ; preds = %for.body } define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @ret_last_lane -; CHECK: vector.body: -; CHECK: store <4 x float> %[[VEC_VAL:.*]], < -; CHECK: middle.block: -; CHECK: %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3 - +; CHECK-LABEL: @ret_last_lane( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x float> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP7]], 2.000000e+00 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[MUL_LCSSA]] +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index 803ee72dea9792..dd1e9889af3811 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s @@ -8,27 +9,72 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; contains a conditional store. The store group contains gaps and is not ; vectorized. ; -; CHECK-LABEL: @interleaved_with_cond_store_0( ; -; CHECK: vector.ph -; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 -; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 -; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf -; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]] ; -; CHECK: vector.body: -; CHECK: %wide.vec = load <4 x i64>, <4 x i64>* %{{.*}} -; CHECK: %strided.vec = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> ; -; CHECK: pred.store.if -; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0 -; CHECK: store i64 %[[X1]], {{.*}} ; -; CHECK: pred.store.if -; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2 -; CHECK: store i64 %[[X2]], {{.*}} define void @interleaved_with_cond_store_0(%pair *%p, i64 %x, i64 %n) { +; CHECK-LABEL: @interleaved_with_cond_store_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 2, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P:%.*]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <4 x i64>* +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 0 +; CHECK-NEXT: store i64 [[TMP6]], i64* [[TMP2]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 2 +; CHECK-NEXT: store i64 [[TMP10]], i64* [[TMP9]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[IF_MERGE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P_1:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[P_1]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[TMP12]], [[X]] +; CHECK-NEXT: br i1 [[TMP13]], label [[IF_THEN:%.*]], label [[IF_MERGE]] +; CHECK: if.then: +; CHECK-NEXT: store i64 [[TMP12]], i64* [[P_1]], align 8 +; CHECK-NEXT: br label [[IF_MERGE]] +; CHECK: if.merge: +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -57,34 +103,84 @@ for.end: ; groups are separately vectorized. The store group contains gaps and is not ; vectorized. ; -; CHECK-LABEL: @interleaved_with_cond_store_1( ; -; CHECK: vector.ph -; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 -; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 -; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf -; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]] ; -; CHECK: vector.body: -; CHECK: %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}} -; CHECK: %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> poison, <2 x i32> ; -; CHECK: pred.store.if -; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0 -; CHECK: store i64 %[[X1]], {{.*}} ; -; CHECK: pred.store.if -; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2 -; CHECK: store i64 %[[X2]], {{.*}} ; -; CHECK: pred.store.continue -; CHECK: %[[L2:.+]] = load <4 x i64>, <4 x i64>* {{.*}} -; CHECK: %[[X3:.+]] = extractelement <4 x i64> %[[L2]], i32 0 -; CHECK: store i64 %[[X3]], {{.*}} -; CHECK: %[[X4:.+]] = extractelement <4 x i64> %[[L2]], i32 2 -; CHECK: store i64 %[[X4]], {{.*}} define void @interleaved_with_cond_store_1(%pair *%p, i64 %x, i64 %n) { +; CHECK-LABEL: @interleaved_with_cond_store_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 2, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP4]] to <4 x i64>* +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, <4 x i64>* [[TMP6]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 0 +; CHECK-NEXT: store i64 [[TMP9]], i64* [[TMP3]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 2 +; CHECK-NEXT: store i64 [[TMP12]], i64* [[TMP11]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP3]] to <4 x i64>* +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP13]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i32 0 +; CHECK-NEXT: store i64 [[TMP14]], i64* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i32 2 +; CHECK-NEXT: store i64 [[TMP15]], i64* [[TMP5]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[IF_MERGE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P_0:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_1:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[P_1]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], [[X]] +; CHECK-NEXT: br i1 [[TMP18]], label [[IF_THEN:%.*]], label [[IF_MERGE]] +; CHECK: if.then: +; CHECK-NEXT: store i64 [[TMP17]], i64* [[P_0]], align 8 +; CHECK-NEXT: br label [[IF_MERGE]] +; CHECK: if.merge: +; CHECK-NEXT: [[TMP19:%.*]] = load i64, i64* [[P_0]], align 8 +; CHECK-NEXT: store i64 [[TMP19]], i64* [[P_1]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -116,29 +212,78 @@ for.end: ; predicated block. The load group is vectorized, and the store groups contain ; gaps and are not vectorized. ; -; CHECK-LABEL: @interleaved_with_cond_store_2( ; -; CHECK: vector.ph -; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 -; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 -; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf -; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]] ; -; CHECK: vector.body: -; CHECK: %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}} -; CHECK: %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> poison, <2 x i32> -; CHECK: store i64 %x, {{.*}} -; CHECK: store i64 %x, {{.*}} ; -; CHECK: pred.store.if -; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0 -; CHECK: store i64 %[[X1]], {{.*}} ; -; CHECK: pred.store.if -; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2 -; CHECK: store i64 %[[X2]], {{.*}} define void @interleaved_with_cond_store_2(%pair *%p, i64 %x, i64 %n) { +; CHECK-LABEL: @interleaved_with_cond_store_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 2, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <4 x i64>* +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, <4 x i64>* [[TMP6]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: store i64 [[X]], i64* [[TMP3]], align 8 +; CHECK-NEXT: store i64 [[X]], i64* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <2 x i64> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 0 +; CHECK-NEXT: store i64 [[TMP9]], i64* [[TMP5]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.if1: +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC]], i32 2 +; CHECK-NEXT: store i64 [[TMP12]], i64* [[TMP11]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[IF_MERGE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P_0:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_1:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[P_1]], align 8 +; CHECK-NEXT: store i64 [[X]], i64* [[P_0]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[TMP14]], [[X]] +; CHECK-NEXT: br i1 [[TMP15]], label [[IF_THEN:%.*]], label [[IF_MERGE]] +; CHECK: if.then: +; CHECK-NEXT: store i64 [[TMP14]], i64* [[P_1]], align 8 +; CHECK-NEXT: br label [[IF_MERGE]] +; CHECK: if.merge: +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll index fd1e909066844b..0688808104c7fb 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s ; First licm pass is to hoist/sink invariant stores if possible. Today LICM does @@ -12,23 +13,64 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; address. -; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction( ; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b) -; CHECK: vector.memcheck: -; CHECK: found.conflict - -; CHECK-LABEL: vector.body: -; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ] -; CHECK: %wide.load = load <4 x i32> -; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load -; CHECK-NEXT: store i32 %ntrunc, i32* %a -; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 - -; CHECK-LABEL: middle.block: -; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> + + define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) { +; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[I3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 +; CHECK-NEXT: [[I3]] = add i32 [[I0]], [[I2]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[I3_LCSSA:%.*]] = phi i32 [ [[I3]], [[FOR_BODY]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[I4:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[I3_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[I4]] +; entry: %ntrunc = trunc i64 %n to i32 br label %for.body @@ -49,14 +91,54 @@ for.end: ; preds = %for.body ret i32 %i4 } -; CHECK-LABEL: inv_val_store_to_inv_address( -; CHECK-LABEL: vector.body: -; CHECK: store i32 %ntrunc, i32* %a -; CHECK: store <4 x i32> -; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: icmp eq i64 %index.next, %n.vec -; CHECK-NEXT: br i1 define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) { +; CHECK-LABEL: @inv_val_store_to_inv_address( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP1]], align 4, !alias.scope !11 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: %ntrunc = trunc i64 %n to i32 br label %for.body @@ -83,21 +165,90 @@ for.end: ; preds = %for.body ; TODO: We can be better with the code gen for the first test and we can have ; just one scalar store if vector.or.reduce(vector_cmp(b[i] == k)) is 1. -; CHECK-LABEL:inv_val_store_to_inv_address_conditional( -; CHECK-LABEL: vector.body: -; CHECK: %wide.load = load <4 x i32>, <4 x i32>* -; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}} -; CHECK: store <4 x i32> -; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0 -; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue -; CHECK-LABEL: pred.store.if: -; CHECK-NEXT: store i32 %ntrunc, i32* %a -; CHECK-NEXT: br label %pred.store.continue -; CHECK-LABEL: pred.store.continue: -; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1 define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @inv_val_store_to_inv_address_conditional( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP3]], align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK: pred.store.if9: +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] +; CHECK: pred.store.continue10: +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK: pred.store.if11: +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] +; CHECK: pred.store.continue12: +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] +; CHECK: pred.store.if13: +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] +; CHECK: pred.store.continue14: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I2]], [[K]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]] +; CHECK: cond_store: +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: %ntrunc = trunc i64 %n to i32 br label %for.body @@ -128,9 +279,31 @@ for.end: ; preds = %for.body ; else a = k; ; TODO: We could vectorize this once we support multiple uniform stores to the ; same address. -; CHECK-LABEL:inv_val_store_to_inv_address_conditional_diff_values( -; CHECK-NOT: load <4 x i32> define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_diff_values( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I2]], [[K:%.*]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]] +; CHECK: cond_store: +; CHECK-NEXT: br label [[LATCH]] +; CHECK: cond_store_k: +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i32 [ [[K]], [[COND_STORE_K]] ], [ [[NTRUNC]], [[COND_STORE]] ] +; CHECK-NEXT: store i32 [[STOREMERGE]], i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: %ntrunc = trunc i64 %n to i32 br label %for.body @@ -169,9 +342,47 @@ for.end: ; preds = %for.body ; } ; } -; CHECK-LABEL: multiple_uniform_stores -; CHECK-NOT: <4 x i32> define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 { +; CHECK-LABEL: @multiple_uniform_stores( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP20]], label [[FOR_END10:%.*]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]] +; CHECK: for.cond1.preheader.preheader: +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; CHECK: for.cond1.preheader: +; CHECK-NEXT: [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR_INC8:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ] +; CHECK-NEXT: [[J_022:%.*]] = phi i32 [ [[J_1_LCSSA:%.*]], [[FOR_INC8]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ] +; CHECK-NEXT: [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]] +; CHECK-NEXT: br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]] +; CHECK: for.body3.lr.ph: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[J_022]] to i64 +; CHECK-NEXT: br label [[FOR_BODY3:%.*]] +; CHECK: for.body3: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i32 [[ADD]], 1 +; CHECK-NEXT: store i32 [[TMP3]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[FOR_BODY3]] +; CHECK: for.inc8.loopexit: +; CHECK-NEXT: br label [[FOR_INC8]] +; CHECK: for.inc8: +; CHECK-NEXT: [[J_1_LCSSA]] = phi i32 [ [[J_022]], [[FOR_COND1_PREHEADER]] ], [ [[ITR]], [[FOR_INC8_LOOPEXIT]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV25:%.*]] = trunc i64 [[INDVARS_IV_NEXT24]] to i32 +; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[LFTR_WIDEIV25]], [[ITR]] +; CHECK-NEXT: br i1 [[EXITCOND26]], label [[FOR_END10_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]] +; CHECK: for.end10.loopexit: +; CHECK-NEXT: br label [[FOR_END10]] +; CHECK: for.end10: +; CHECK-NEXT: ret i32 undef +; entry: %cmp20 = icmp eq i32 %itr, 0 br i1 %cmp20, label %for.end10, label %for.cond1.preheader @@ -215,9 +426,53 @@ for.end10: ; preds = %for.inc8, %entry ; second uniform store to the same address is conditional. ; we do not vectorize this. -; CHECK-LABEL: multiple_uniform_stores_conditional -; CHECK-NOT: <4 x i32> define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 { +; CHECK-LABEL: @multiple_uniform_stores_conditional( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP20]], label [[FOR_END10:%.*]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]] +; CHECK: for.cond1.preheader.preheader: +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; CHECK: for.cond1.preheader: +; CHECK-NEXT: [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR_INC8:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ] +; CHECK-NEXT: [[J_022:%.*]] = phi i32 [ [[J_1_LCSSA:%.*]], [[FOR_INC8]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ] +; CHECK-NEXT: [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]] +; CHECK-NEXT: br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]] +; CHECK: for.body3.lr.ph: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[J_022]] to i64 +; CHECK-NEXT: br label [[FOR_BODY3:%.*]] +; CHECK: for.body3: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i32 [[ADD]], 42 +; CHECK-NEXT: br i1 [[TMP3]], label [[COND_STORE:%.*]], label [[LATCH]] +; CHECK: cond_store: +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i32 [[ADD]], 1 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i32 [ [[TMP4]], [[COND_STORE]] ], [ [[ADD]], [[FOR_BODY3]] ] +; CHECK-NEXT: store i32 [[STOREMERGE]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[FOR_BODY3]] +; CHECK: for.inc8.loopexit: +; CHECK-NEXT: br label [[FOR_INC8]] +; CHECK: for.inc8: +; CHECK-NEXT: [[J_1_LCSSA]] = phi i32 [ [[J_022]], [[FOR_COND1_PREHEADER]] ], [ [[ITR]], [[FOR_INC8_LOOPEXIT]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV25:%.*]] = trunc i64 [[INDVARS_IV_NEXT24]] to i32 +; CHECK-NEXT: [[EXITCOND26:%.*]] = icmp eq i32 [[LFTR_WIDEIV25]], [[ITR]] +; CHECK-NEXT: br i1 [[EXITCOND26]], label [[FOR_END10_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]] +; CHECK: for.end10.loopexit: +; CHECK-NEXT: br label [[FOR_END10]] +; CHECK: for.end10: +; CHECK-NEXT: ret i32 undef +; entry: %cmp20 = icmp eq i32 %itr, 0 br i1 %cmp20, label %for.end10, label %for.cond1.preheader @@ -272,8 +527,35 @@ for.end10: ; preds = %for.inc8, %entry ; Note: %i10 could be replaced by phi(%arg4, %i12), a potentially vectorizable ; 1st-order-recurrence define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) { -; CHECK-LABEL: unsafe_dep_uniform_load_store -; CHECK-NOT: <4 x i32> +; CHECK-LABEL: @unsafe_dep_uniform_load_store( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[I6:%.*]] = getelementptr inbounds i16, i16* [[ARG3:%.*]], i64 [[ARG5:%.*]] +; CHECK-NEXT: br label [[BB7:%.*]] +; CHECK: bb7: +; CHECK-NEXT: [[I121:%.*]] = phi i32 [ [[ARG4:%.*]], [[BB:%.*]] ], [ [[I12:%.*]], [[BB7]] ] +; CHECK-NEXT: [[I8:%.*]] = phi i64 [ 0, [[BB]] ], [ [[I24:%.*]], [[BB7]] ] +; CHECK-NEXT: [[I9:%.*]] = phi i32 [ [[ARG1:%.*]], [[BB]] ], [ [[I23:%.*]], [[BB7]] ] +; CHECK-NEXT: [[I11:%.*]] = mul nsw i32 [[I9]], [[I121]] +; CHECK-NEXT: [[I12]] = srem i32 [[I11]], 65536 +; CHECK-NEXT: [[I13:%.*]] = add nsw i32 [[I12]], [[I9]] +; CHECK-NEXT: [[I14:%.*]] = trunc i32 [[I13]] to i16 +; CHECK-NEXT: [[I15:%.*]] = trunc i64 [[I8]] to i32 +; CHECK-NEXT: [[I16:%.*]] = add i32 [[I15]], [[ARG:%.*]] +; CHECK-NEXT: [[I17:%.*]] = zext i32 [[I16]] to i64 +; CHECK-NEXT: [[I18:%.*]] = getelementptr inbounds i16, i16* [[I6]], i64 [[I17]] +; CHECK-NEXT: store i16 [[I14]], i16* [[I18]], align 2 +; CHECK-NEXT: [[I19:%.*]] = add i32 [[I13]], [[I9]] +; CHECK-NEXT: [[I20:%.*]] = trunc i32 [[I19]] to i16 +; CHECK-NEXT: [[I21:%.*]] = and i16 [[I20]], 255 +; CHECK-NEXT: [[I22:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i64 [[I17]] +; CHECK-NEXT: store i16 [[I21]], i16* [[I22]], align 2 +; CHECK-NEXT: [[I23]] = add nsw i32 [[I9]], 1 +; CHECK-NEXT: [[I24]] = add nuw nsw i64 [[I8]], 1 +; CHECK-NEXT: [[I25:%.*]] = icmp eq i64 [[I24]], [[ARG2:%.*]] +; CHECK-NEXT: br i1 [[I25]], label [[BB26:%.*]], label [[BB7]] +; CHECK: bb26: +; CHECK-NEXT: ret void +; bb: %i = alloca i32 store i32 %arg4, i32* %i @@ -309,4 +591,3 @@ bb26: } ; Make sure any check-not directives are not triggered by function declarations. -; CHECK: declare diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll index d1c38b6ce4098c..ec4a3397593349 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll @@ -1,31 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts ; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine -enable-new-pm=0 2>&1 | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-vectorize,instcombine -force-vector-width=2 -force-vector-interleave=1 -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" -; CHECK-LABEL: vector_gep -; CHECK-NOT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i +; +define void @vector_gep(i32** %a, i32 *%b, i64 %n) { +; CHECK-LABEL: @vector_gep( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <2 x i64> [[VEC_IND]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>* -; CHECK-NEXT: store <2 x i32*> [[TMP1]], <2 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <2 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>* +; CHECK-NEXT: store <2 x i32*> [[TMP0]], <2 x i32*>* [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[VAR1:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]] +; CHECK-NEXT: store i32* [[VAR0]], i32** [[VAR1]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; -define void @vector_gep(i32** %a, i32 *%b, i64 %n) { entry: br label %for.body for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i - store i32* %tmp0, i32** %tmp1, align 8 + %var0 = getelementptr inbounds i32, i32* %b, i64 %i + %var1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %var0, i32** %var1, align 8 %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -34,33 +59,58 @@ for.end: ret void } -; CHECK-LABEL: scalar_store -; CHECK: LV: Found scalar instruction: %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i -; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i -; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] -; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; +define void @scalar_store(i32** %a, i32 *%b, i64 %n) { +; CHECK-LABEL: @scalar_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2) +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806 +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[TMP3]] +; CHECK-NEXT: store i32* [[TMP4]], i32** [[TMP6]], align 8 ; CHECK-NEXT: store i32* [[TMP5]], i32** [[TMP7]], align 8 -; CHECK-NEXT: store i32* [[TMP6]], i32** [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[VAR1:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]] +; CHECK-NEXT: store i32* [[VAR0]], i32** [[VAR1]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; -define void @scalar_store(i32** %a, i32 *%b, i64 %n) { entry: br label %for.body for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i - store i32* %tmp0, i32** %tmp1, align 8 + %var0 = getelementptr inbounds i32, i32* %b, i64 %i + %var1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %var0, i32** %var1, align 8 %i.next = add nuw nsw i64 %i, 2 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -69,39 +119,63 @@ for.end: ret void } -; CHECK-LABEL: expansion -; CHECK: LV: Found scalar instruction: %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i -; CHECK-NEXT: LV: Found scalar instruction: %tmp1 = bitcast i64* %tmp0 to i32* -; CHECK-NEXT: LV: Found scalar instruction: %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0 -; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i -; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] -; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; +define void @expansion(i32** %a, i64 *%b, i64 %n) { +; CHECK-LABEL: @expansion( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2) +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775806 +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32** [[TMP6]] to i64** +; CHECK-NEXT: store i64* [[TMP4]], i64** [[TMP8]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32** [[TMP7]] to i64** ; CHECK-NEXT: store i64* [[TMP5]], i64** [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32** [[TMP8]] to i64** -; CHECK-NEXT: store i64* [[TMP6]], i64** [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I]] +; CHECK-NEXT: [[VAR3:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32** [[VAR3]] to i64** +; CHECK-NEXT: store i64* [[VAR0]], i64** [[TMP11]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; -define void @expansion(i32** %a, i64 *%b, i64 %n) { entry: br label %for.body for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i - %tmp1 = bitcast i64* %tmp0 to i32* - %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0 - %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i - store i32* %tmp1, i32** %tmp3, align 8 + %var0 = getelementptr inbounds i64, i64* %b, i64 %i + %var1 = bitcast i64* %var0 to i32* + %var2 = getelementptr inbounds i32*, i32** %a, i64 0 + %var3 = getelementptr inbounds i32*, i32** %var2, i64 %i + store i32* %var1, i32** %var3, align 8 %i.next = add nuw nsw i64 %i, 2 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -110,31 +184,53 @@ for.end: ret void } -; CHECK-LABEL: no_gep_or_bitcast -; CHECK-NOT: LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8 -; CHECK: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] -; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1 +; +define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) { +; CHECK-LABEL: @no_gep_or_bitcast( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32** [[TMP0]] to <2 x i32*>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: store i32 0, i32* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1 ; CHECK-NEXT: store i32 0, i32* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: store i32 0, i32* [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]] +; CHECK-NEXT: [[VAR1:%.*]] = load i32*, i32** [[VAR0]], align 8 +; CHECK-NEXT: store i32 0, i32* [[VAR1]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; -define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) { entry: br label %for.body for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %tmp0 = getelementptr inbounds i32*, i32** %a, i64 %i - %tmp1 = load i32*, i32** %tmp0, align 8 - store i32 0, i32* %tmp1, align 8 + %var0 = getelementptr inbounds i32*, i32** %a, i64 %i + %var1 = load i32*, i32** %var0, align 8 + store i32 0, i32* %var1, align 8 %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll index 0ded0899078fbb..0f4e4d0988fa52 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -scalable-vectorization=on -force-target-instruction-cost=1 -force-target-supports-scalable-vectors -dce -instcombine < %s -S | FileCheck %s ; Test that we can add on the induction variable @@ -9,25 +10,72 @@ define void @add_ind64_unrolled(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @add_ind64_unrolled( ; CHECK-NEXT: entry: -; CHECK: vector.body: -; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK-NEXT: %[[STEPVEC:.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: %[[TMP1:.*]] = insertelement poison, i64 %[[INDEX]], i32 0 -; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector %[[TMP1]], poison, zeroinitializer -; CHECK-NEXT: %[[VECIND1:.*]] = add %[[IDXSPLT]], %[[STEPVEC]] -; CHECK-NEXT: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %[[EC:.*]] = shl i64 %[[VSCALE]], 1 -; CHECK-NEXT: %[[TMP2:.*]] = insertelement poison, i64 %[[EC]], i32 0 -; CHECK-NEXT: %[[ECSPLT:.*]] = shufflevector %[[TMP2]], poison, zeroinitializer -; CHECK-NEXT: %[[TMP3:.*]] = add %[[ECSPLT]], %[[STEPVEC]] -; CHECK-NEXT: %[[VECIND2:.*]] = add %[[IDXSPLT]], %[[TMP3]] -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[STOREVAL1:.*]] = add nsw %[[LOAD1]], %[[VECIND1]] -; CHECK: %[[STOREVAL2:.*]] = add nsw %[[LOAD2]], %[[VECIND2]] -; CHECK: store %[[STOREVAL1]] -; CHECK: store %[[STOREVAL2]] - +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement poison, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector [[DOTSPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add [[DOTSPLAT3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement poison, i64 [[TMP7]], i32 0 +; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector [[DOTSPLATINSERT4]], poison, zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = add [[DOTSPLAT5]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = add [[DOTSPLAT3]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP13:%.*]] = shl i32 [[TMP12]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[TMP15]] to * +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , * [[TMP16]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[WIDE_LOAD]], [[TMP5]] +; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[WIDE_LOAD6]], [[TMP9]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64* [[TMP19]] to * +; CHECK-NEXT: store [[TMP17]], * [[TMP20]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP22:%.*]] = shl i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, i64* [[TMP19]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i64* [[TMP24]] to * +; CHECK-NEXT: store [[TMP18]], * [[TMP25]], align 8 +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = shl i64 [[TMP26]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP27]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP29:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP29]], [[I_08]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_08]] +; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -52,24 +100,69 @@ exit: ; preds = %for.body define void @add_ind64_unrolled_nxv1i64(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @add_ind64_unrolled_nxv1i64( ; CHECK-NEXT: entry: -; CHECK: vector.body: -; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] -; CHECK-NEXT: %[[STEPVEC:.*]] = call @llvm.experimental.stepvector.nxv1i64() -; CHECK-NEXT: %[[TMP1:.*]] = insertelement poison, i64 %[[INDEX]], i32 0 -; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector %[[TMP1]], poison, zeroinitializer -; CHECK-NEXT: %[[VECIND1:.*]] = add %[[IDXSPLT]], %[[STEPVEC]] -; CHECK-NEXT: %[[EC:.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: %[[TMP2:.*]] = insertelement poison, i64 %[[EC]], i32 0 -; CHECK-NEXT: %[[ECSPLT:.*]] = shufflevector %[[TMP2]], poison, zeroinitializer -; CHECK-NEXT: %[[TMP3:.*]] = add %[[ECSPLT]], %[[STEPVEC]] -; CHECK-NEXT: %[[VECIND2:.*]] = add %[[IDXSPLT]], %[[TMP3]] -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[STOREVAL1:.*]] = add nsw %[[LOAD1]], %[[VECIND1]] -; CHECK: %[[STOREVAL2:.*]] = add nsw %[[LOAD2]], %[[VECIND2]] -; CHECK: store %[[STOREVAL1]] -; CHECK: store %[[STOREVAL2]] - +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement poison, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector [[DOTSPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = add [[DOTSPLAT3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement poison, i64 [[TMP6]], i32 0 +; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector [[DOTSPLATINSERT4]], poison, zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = add [[DOTSPLAT5]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = add [[DOTSPLAT3]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to * +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , * [[TMP14]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = add nsw [[WIDE_LOAD]], [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = add nsw [[WIDE_LOAD6]], [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[TMP17]] to * +; CHECK-NEXT: store [[TMP15]], * [[TMP18]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, i64* [[TMP17]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i64* [[TMP21]] to * +; CHECK-NEXT: store [[TMP16]], * [[TMP22]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP24:%.*]] = shl i64 [[TMP23]], 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP26:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP26]], [[I_08]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_08]] +; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -100,17 +193,56 @@ exit: ; preds = %for.body define void @add_unique_ind32(i32* noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @add_unique_ind32( -; CHECK: vector.ph: -; CHECK: %[[STEPVEC:.*]] = call @llvm.experimental.stepvector.nxv4i32() -; CHECK-NEXT: %[[INDINIT:.*]] = shl %[[STEPVEC]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: %[[INC:.*]] = shl i32 %[[VSCALE]], 3 -; CHECK-NEXT: %[[TMP:.*]] = insertelement poison, i32 %[[INC]], i32 0 -; CHECK-NEXT: %[[VECINC:.*]] = shufflevector %[[TMP]], poison, zeroinitializer -; CHECK: vector.body: -; CHECK: %[[VECIND:.*]] = phi [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ] -; CHECK: store %[[VECIND]] -; CHECK: %[[VECINDNXT]] = add %[[VECIND]], %[[VECINC]] +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[CAST_CRD]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP5:%.*]] = shl [[TMP4]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl i32 [[TMP6]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to * +; CHECK-NEXT: store [[VEC_IND]], * [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_08]] +; CHECK-NEXT: store i32 [[R_07]], i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[R_07]], 2 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -139,22 +271,61 @@ exit: ; preds = %for.body define void @add_unique_indf32(float* noalias nocapture %a, i64 %n) { ; CHECK-LABEL: @add_unique_indf32( -; CHECK: vector.ph: -; CHECK: %[[STEPVEC:.*]] = call @llvm.experimental.stepvector.nxv4i32() -; CHECK-NEXT: %[[TMP1:.*]] = uitofp %[[STEPVEC]] to -; CHECK-NEXT: %[[TMP2:.*]] = fmul %[[TMP1]], shufflevector ( insertelement ( poison, float 2.000000e+00, i32 0), poison, zeroinitializer) -; CHECK-NEXT: %[[INDINIT:.*]] = fadd %[[TMP2]], zeroinitializer -; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: %[[TMP3:.*]] = shl i32 %8, 2 -; CHECK-NEXT: %[[TMP4:.*]] = uitofp i32 %[[TMP3]] to float -; CHECK-NEXT: %[[INC:.*]] = fmul float %[[TMP4]], 2.000000e+00 -; CHECK-NEXT: %[[TMP5:.*]] = insertelement poison, float %[[INC]], i32 0 -; CHECK-NEXT: %[[VECINC:.*]] = shufflevector %[[TMP5]], poison, zeroinitializer -; CHECK: vector.body: -; CHECK: %[[VECIND:.*]] = phi [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ] -; CHECK: store %[[VECIND]] -; CHECK: %[[VECINDNXT]] = fadd %[[VECIND]], %[[VECINC]] - +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float +; CHECK-NEXT: [[TMP4:%.*]] = fmul float [[CAST_CRD]], 2.000000e+00 +; CHECK-NEXT: [[IND_END:%.*]] = fadd float [[TMP4]], 0.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP6:%.*]] = uitofp [[TMP5]] to +; CHECK-NEXT: [[TMP7:%.*]] = fmul [[TMP6]], shufflevector ( insertelement ( poison, float 2.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = fadd [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float +; CHECK-NEXT: [[TMP11:%.*]] = fmul float [[TMP10]], 2.000000e+00 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[TMP11]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to * +; CHECK-NEXT: store [[VEC_IND]], * [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = fadd [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I_08]] +; CHECK-NEXT: store float [[R_07]], float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = fadd float [[R_07]], 2.000000e+00 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/vector-geps.ll b/llvm/test/Transforms/LoopVectorize/vector-geps.ll index beed0542db9362..e371b98e06fc9e 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-geps.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-geps.ll @@ -1,28 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +; +define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) { ; CHECK-LABEL: @vector_gep_stored( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>* -; CHECK-NEXT: store <4 x i32*> [[TMP1]], <4 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <4 x i32*>* +; CHECK-NEXT: store <4 x i32*> [[TMP0]], <4 x i32*>* [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[VAR1:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]] +; CHECK-NEXT: store i32* [[VAR0]], i32** [[VAR1]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; -define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) { entry: br label %for.body for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i - store i32* %tmp0, i32** %tmp1, align 8 + %var0 = getelementptr inbounds i32, i32* %b, i64 %i + %var1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %var0, i32** %var1, align 8 %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -31,27 +57,52 @@ for.end: ret void } +; +define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) { ; CHECK-LABEL: @uniform_vector_gep_stored( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, i64 1 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP0]], i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32*> [[DOTSPLATINSERT]], <4 x i32*> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>* -; CHECK-NEXT: store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32*, i32** [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <4 x i32*>* +; CHECK-NEXT: store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 +; CHECK-NEXT: [[VAR1:%.*]] = getelementptr inbounds i32*, i32** [[A]], i64 [[I]] +; CHECK-NEXT: store i32* [[VAR0]], i32** [[VAR1]], align 8 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; -define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) { entry: br label %for.body for.body: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %tmp0 = getelementptr inbounds i32, i32* %b, i64 1 - %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i - store i32* %tmp0, i32** %tmp1, align 8 + %var0 = getelementptr inbounds i32, i32* %b, i64 1 + %var1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %var0, i32** %var1, align 8 %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end