Skip to content

Commit

Permalink
[x86] Lowering Mask Set1 intrinsics to LLVM IR
Browse files Browse the repository at this point in the history
This patch, together with a matching clang patch (https://reviews.llvm.org/D37668), implements the lowering of X86 mask set1 intrinsics to IR.

Differential Revision: https://reviews.llvm.org/D37669

llvm-svn: 313625
  • Loading branch information
Jina Nahias committed Sep 19, 2017
1 parent 3ad702a commit ccfb8d4
Show file tree
Hide file tree
Showing 15 changed files with 2,282 additions and 345 deletions.
59 changes: 0 additions & 59 deletions llvm/include/llvm/IR/IntrinsicsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -1871,65 +1871,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
llvm_v32i8_ty], [IntrNoMem]>;
}

// Vector load with broadcast
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_pbroadcast_b_gpr_128 :
GCCBuiltin<"__builtin_ia32_pbroadcastb128_gpr_mask">,
Intrinsic<[llvm_v16i8_ty],
[llvm_i8_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pbroadcast_b_gpr_256 :
GCCBuiltin<"__builtin_ia32_pbroadcastb256_gpr_mask">,
Intrinsic<[llvm_v32i8_ty],
[llvm_i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pbroadcast_b_gpr_512 :
GCCBuiltin<"__builtin_ia32_pbroadcastb512_gpr_mask">,
Intrinsic<[llvm_v64i8_ty],
[llvm_i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;

def int_x86_avx512_mask_pbroadcast_w_gpr_128 :
GCCBuiltin<"__builtin_ia32_pbroadcastw128_gpr_mask">,
Intrinsic<[llvm_v8i16_ty],
[llvm_i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pbroadcast_w_gpr_256 :
GCCBuiltin<"__builtin_ia32_pbroadcastw256_gpr_mask">,
Intrinsic<[llvm_v16i16_ty],
[llvm_i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pbroadcast_w_gpr_512 :
GCCBuiltin<"__builtin_ia32_pbroadcastw512_gpr_mask">,
Intrinsic<[llvm_v32i16_ty],
[llvm_i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;

def int_x86_avx512_mask_pbroadcast_d_gpr_128 :
GCCBuiltin<"__builtin_ia32_pbroadcastd128_gpr_mask">,
Intrinsic<[llvm_v4i32_ty],
[llvm_i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pbroadcast_d_gpr_256 :
GCCBuiltin<"__builtin_ia32_pbroadcastd256_gpr_mask">,
Intrinsic<[llvm_v8i32_ty],
[llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pbroadcast_d_gpr_512 :
GCCBuiltin<"__builtin_ia32_pbroadcastd512_gpr_mask">,
Intrinsic<[llvm_v16i32_ty],
[llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;

def int_x86_avx512_mask_pbroadcast_q_gpr_128 :
GCCBuiltin<"__builtin_ia32_pbroadcastq128_gpr_mask">,
Intrinsic<[llvm_v2i64_ty],
[llvm_i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pbroadcast_q_gpr_256 :
GCCBuiltin<"__builtin_ia32_pbroadcastq256_gpr_mask">,
Intrinsic<[llvm_v4i64_ty],
[llvm_i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pbroadcast_q_gpr_512 :
GCCBuiltin<"__builtin_ia32_pbroadcastq512_gpr_mask">,
Intrinsic<[llvm_v8i64_ty],
[llvm_i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;

def int_x86_avx512_mask_pbroadcast_q_mem_512 :
GCCBuiltin<"__builtin_ia32_pbroadcastq512_mem_mask">,
Intrinsic<[llvm_v8i64_ty],
[llvm_i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
}

// Vector permutation
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name=="ssse3.pabs.d.128" || // Added in 6.0
Name.startswith("avx2.pabs.") || // Added in 6.0
Name.startswith("avx512.mask.pabs.") || // Added in 6.0
Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0
Name.startswith("sse2.pcmpeq.") || // Added in 3.1
Name.startswith("sse2.pcmpgt.") || // Added in 3.1
Name.startswith("avx2.pcmpeq.") || // Added in 3.1
Expand Down Expand Up @@ -1031,6 +1032,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
CI->getArgOperand(0), CI->getArgOperand(1));
Rep = Builder.CreateSExt(Rep, CI->getType(), "");
} else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){
unsigned NumElts =
CI->getArgOperand(1)->getType()->getVectorNumElements();
Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
} else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
Type *I32Ty = Type::getInt32Ty(C);
Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
Expand Down
24 changes: 0 additions & 24 deletions llvm/lib/Target/X86/X86IntrinsicsInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -797,30 +797,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_512, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_128, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_256, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_512, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_128, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_256, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_512, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_128, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_256, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_512, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
Expand Down
104 changes: 104 additions & 0 deletions llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,110 @@

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c

define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
; X32-LABEL: test_mm512_mask_set1_epi32:
; X32: # BB#0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: kmovw %ecx, %k1
; X32-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi32:
; X64: # BB#0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
%vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
%0 = bitcast <8 x i64> %__O to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
%3 = bitcast <16 x i32> %2 to <8 x i64>
ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
; X32-LABEL: test_mm512_maskz_set1_epi32:
; X32: # BB#0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: kmovw %ecx, %k1
; X32-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi32:
; X64: # BB#0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
%vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
%2 = bitcast <16 x i32> %1 to <8 x i64>
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X32-LABEL: test_mm512_mask_set1_epi64:
; X32: # BB#0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: vmovd %edx, %xmm1
; X32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
; X32-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
; X32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi64:
; X64: # BB#0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
%vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X32-LABEL: test_mm512_maskz_set1_epi64:
; X32: # BB#0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: vmovd %edx, %xmm0
; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi64:
; X64: # BB#0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
%vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
ret <8 x i64> %1
}


define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm512_broadcastd_epi32:
; X32: # BB#0:
Expand Down
40 changes: 40 additions & 0 deletions llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,46 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpbroadcastd %edi, %zmm1
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res2, %res3
ret <16 x i32> %res4
}
declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)


define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpbroadcastq %rdi, %zmm1
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res2, %res3
ret <8 x i64> %res4
}
declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)


declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly

define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
Expand Down
36 changes: 0 additions & 36 deletions llvm/test/CodeGen/X86/avx512-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4171,44 +4171,8 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2
ret i8 %res2
}

define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z}
; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
; CHECK-NEXT: vpbroadcastd %edi, %zmm2
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res2, %res3
ret <16 x i32> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)

define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
; CHECK-NEXT: vpbroadcastq %rdi, %zmm2
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res2, %res3
ret <8 x i64> %res4
}
declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)

declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)

Expand Down

0 comments on commit ccfb8d4

Please sign in to comment.