diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index 3aa682cd2971c..b66e19e8bc563 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -1144,3 +1144,53 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %
   %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32>
   ret void
 }
+
+define void @insert(<16 x i8> %i8v16, <16 x i8> %i8v16_2) {
+; ALL-LABEL: 'insert'
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test3 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test4 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test5 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test6 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test7 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'insert'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %test2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test3 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test4 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test5 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test6 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %test7 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test3 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test4 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test5 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test6 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test7 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  ret void
+}
+
+define void @splice(<16 x i8> %i8v16, <16 x i8> %i8v16_2) {
+; ALL-LABEL: 'splice'
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'splice'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %test0 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  %test1 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32>
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
index f71fdbdee527b..c9a013bd58322 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name I --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
@@ -5,9 +6,18 @@
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fadd_combine_v2f16
-; GCN: fadd <2 x half>
 
 define void @fadd_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fadd_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fadd <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -24,9 +34,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fsub_combine_v2f16
-; GCN: fsub <2 x half>
 define void @fsub_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fsub_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fsub <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -43,9 +62,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fmul_combine_v2f16
-; GCN: fmul <2 x half>
 define void @fmul_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fmul_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fmul <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -61,9 +89,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @fdiv_combine_v2f16
-; GCN: fdiv <2 x half>
 define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fdiv_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fdiv <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -79,9 +116,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @frem_combine_v2f16
-; GCN: frem <2 x half>
 define void @frem_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @frem_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = frem <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -98,9 +144,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fma_combine_v2f16
-; GCN: call <2 x half> @llvm.fma.v2f16
 define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define amdgpu_kernel void @fma_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -117,9 +172,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fmuladd_combine_v2f16
-; GCN: call <2 x half> @llvm.fmuladd.v2f16
 define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define amdgpu_kernel void @fmuladd_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -135,12 +199,35 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @minnum_combine_v2f16
-; GFX8: call half @llvm.minnum.f16(
-; GFX8: call half @llvm.minnum.f16(
-; GFX9: call <2 x half> @llvm.minnum.v2f16
 
 define void @minnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @minnum_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.minnum.f16(half [[ITMP3]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.minnum.f16(half [[ITMP7]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @minnum_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -156,12 +243,35 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @maxnum_combine_v2f16
-; GFX8: call half @llvm.maxnum.f16(
-; GFX8: call half @llvm.maxnum.f16(
-; GFX9: call <2 x half> @llvm.maxnum.v2f16
 
 define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @maxnum_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.maxnum.f16(half [[ITMP3]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.maxnum.f16(half [[ITMP7]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @maxnum_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -178,10 +288,23 @@ bb:
 }
 
 ; FIXME: Should vectorize
-; GCN-LABEL: @minimum_combine_v2f16
-; GCN: call half @llvm.minimum.f16(
-; GCN: call half @llvm.minimum.f16(
 define void @minimum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @minimum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP4:%.*]] = call half @llvm.minimum.f16(half [[ITMP3]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GCN-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GCN-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: [[ITMP8:%.*]] = call half @llvm.minimum.f16(half [[ITMP7]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -197,10 +320,23 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @maximum_combine_v2f16
-; GCN: call half @llvm.maximum.f16(
-; GCN: call half @llvm.maximum.f16(
 define void @maximum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @maximum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP4:%.*]] = call half @llvm.maximum.f16(half [[ITMP3]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GCN-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GCN-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: [[ITMP8:%.*]] = call half @llvm.maximum.f16(half [[ITMP7]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -216,9 +352,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @canonicalize_combine_v2f16
-; GCN: call <2 x half> @llvm.canonicalize.v2f16(
 define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @canonicalize_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]])
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -234,9 +379,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @fabs_combine_v2f16
-; GCN: call <2 x half> @llvm.fabs.v2f16(
 define void @fabs_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fabs_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP0]])
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -252,9 +406,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @fneg_combine_v2f16
-; GCN: fneg <2 x half>
 define void @fneg_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fneg_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fneg <2 x half> [[TMP0]]
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -270,11 +433,36 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @copysign_combine_v2f16
-; GFX8: call half @llvm.copysign.f16(
-; GFX8: call half @llvm.copysign.f16(
-; GFX9: call <2 x half> @llvm.copysign.v2f16(
 define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
+; GFX8-LABEL: define void @copysign_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.copysign.f16(half [[ITMP3]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.copysign.f16(half [[ITMP7]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @copysign_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[SIGN]], i32 0
+; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <2 x i32> zeroinitializer
+; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP2]])
+; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -291,12 +479,59 @@ bb:
 }
 
 ; FIXME: Should always vectorize
-; GCN-LABEL: @copysign_combine_v4f16
-; GFX8: call half @llvm.copysign.f16(
-; GFX8: call half @llvm.copysign.f16(
-; GFX9: call <2 x half> @llvm.copysign.v2f16(
 
 define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) {
+; GFX8-LABEL: define void @copysign_combine_v4f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.copysign.f16(half [[ITMP3]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.copysign.f16(half [[ITMP7]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2
+; GFX8-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]]
+; GFX8-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP12:%.*]] = call half @llvm.copysign.f16(half [[ITMP11]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP12]], ptr addrspace(1) [[ITMP10]], align 2
+; GFX8-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3
+; GFX8-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]]
+; GFX8-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2
+; GFX8-NEXT: [[ITMP16:%.*]] = call half @llvm.copysign.f16(half [[ITMP15]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP16]], ptr addrspace(1) [[ITMP14]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @copysign_combine_v4f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX9-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[SIGN]], i32 0
+; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <2 x i32> zeroinitializer
+; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP2]])
+; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2
+; GFX9-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]]
+; GFX9-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX9-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3
+; GFX9-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]]
+; GFX9-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2
+; GFX9-NEXT: [[TMP4:%.*]] = insertelement <2 x half> poison, half [[ITMP11]], i32 0
+; GFX9-NEXT: [[TMP5:%.*]] = insertelement <2 x half> [[TMP4]], half [[ITMP15]], i32 1
+; GFX9-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP5]], <2 x half> [[TMP2]])
+; GFX9-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[ITMP10]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -326,12 +561,54 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @canonicalize_combine_v4f16
-; GFX8: call half @llvm.canonicalize.f16(
-; GFX8: call half @llvm.canonicalize.f16(
-; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
 
 define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @canonicalize_combine_v4f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]])
+; GFX8-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2
+; GFX8-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]]
+; GFX8-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP12:%.*]] = call half @llvm.canonicalize.f16(half [[ITMP11]])
+; GFX8-NEXT: store half [[ITMP12]], ptr addrspace(1) [[ITMP10]], align 2
+; GFX8-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3
+; GFX8-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]]
+; GFX8-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2
+; GFX8-NEXT: [[ITMP16:%.*]] = call half @llvm.canonicalize.f16(half [[ITMP15]])
+; GFX8-NEXT: store half [[ITMP16]], ptr addrspace(1) [[ITMP14]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @canonicalize_combine_v4f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX9-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]])
+; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[ITMP9:%.*]] = add nuw nsw i64 [[ITMP1]], 2
+; GFX9-NEXT: [[ITMP10:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP9]]
+; GFX9-NEXT: [[ITMP11:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX9-NEXT: [[ITMP13:%.*]] = add nuw nsw i64 [[ITMP1]], 3
+; GFX9-NEXT: [[ITMP14:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP13]]
+; GFX9-NEXT: [[ITMP15:%.*]] = load half, ptr addrspace(1) [[ITMP14]], align 2
+; GFX9-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[ITMP11]], i32 0
+; GFX9-NEXT: [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[ITMP15]], i32 1
+; GFX9-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP3]])
+; GFX9-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[ITMP10]], align 2
+; GFX9-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -362,10 +639,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @minimumnum_combine_v2f16
-; GFX8: call <2 x half> @llvm.minimumnum.v2f16
-; GFX9: call <2 x half> @llvm.minimumnum.v2f16
 define void @minimumnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @minimumnum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -382,10 +667,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @maximumnum_combine_v2f16
-; GFX8: call <2 x half> @llvm.maximumnum.v2f16
-; GFX9: call <2 x half> @llvm.maximumnum.v2f16
 define void @maximumnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @maximumnum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64