diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 8d01b7dbd15f6..224a182ed17d1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -100,6 +100,44 @@ static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder, return builder.createBitcast(resVec, ops[0].getType()); } +static mlir::Value emitX86MaskUnpack(CIRGenBuilderTy &builder, + mlir::Location loc, + const std::string &intrinsicName, + SmallVectorImpl<mlir::Value> &ops) { + unsigned numElems = cast<cir::IntType>(ops[0].getType()).getWidth(); + + // Convert both operands to mask vectors. + mlir::Value lhs = getMaskVecValue(builder, loc, ops[0], numElems); + mlir::Value rhs = getMaskVecValue(builder, loc, ops[1], numElems); + + mlir::Type i32Ty = builder.getSInt32Ty(); + + // Create indices for extracting the first half of each vector. + SmallVector<mlir::Attribute> halfIndices; + for (auto i : llvm::seq<unsigned>(0, numElems / 2)) + halfIndices.push_back(cir::IntAttr::get(i32Ty, i)); + + // Extract first half of each vector. This gives better codegen than + // doing it in a single shuffle. + mlir::Value lhsHalf = builder.createVecShuffle(loc, lhs, lhs, halfIndices); + mlir::Value rhsHalf = builder.createVecShuffle(loc, rhs, rhs, halfIndices); + + // Create indices for concatenating the vectors. + // NOTE: Operands are swapped to match the intrinsic definition. + // After the half extraction, both vectors have numElems/2 elements. + // In createVecShuffle(rhsHalf, lhsHalf, indices), indices [0..numElems/2-1] + // select from rhsHalf, and indices [numElems/2..numElems-1] select from + // lhsHalf. + SmallVector<mlir::Attribute> concatIndices; + for (auto i : llvm::seq<unsigned>(0, numElems)) + concatIndices.push_back(cir::IntAttr::get(i32Ty, i)); + + // Concat the vectors (RHS first, then LHS). 
+ mlir::Value res = + builder.createVecShuffle(loc, rhsHalf, lhsHalf, concatIndices); + return builder.createBitcast(res, ops[0].getType()); +} + static mlir::Value emitX86MaskLogic(CIRGenBuilderTy &builder, mlir::Location loc, cir::BinOpKind binOpKind, @@ -257,7 +295,15 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, return emitVecInsert(builder, getLoc(expr->getExprLoc()), ops[0], ops[1], ops[2]); } - + case X86::BI__builtin_ia32_kunpckhi: + return emitX86MaskUnpack(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kunpackb", ops); + case X86::BI__builtin_ia32_kunpcksi: + return emitX86MaskUnpack(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kunpackw", ops); + case X86::BI__builtin_ia32_kunpckdi: + return emitX86MaskUnpack(builder, getLoc(expr->getExprLoc()), + "x86.avx512.kunpackd", ops); case X86::BI_mm_setcsr: case X86::BI__builtin_ia32_ldmxcsr: { mlir::Location loc = getLoc(expr->getExprLoc()); @@ -947,9 +993,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, getMaskVecValue(builder, getLoc(expr->getExprLoc()), ops[0], numElts); return builder.createBitcast(resVec, ops[0].getType()); } - case X86::BI__builtin_ia32_kunpckdi: - case X86::BI__builtin_ia32_kunpcksi: - case X86::BI__builtin_ia32_kunpckhi: case X86::BI__builtin_ia32_sqrtsh_round_mask: case X86::BI__builtin_ia32_sqrtsd_round_mask: case X86::BI__builtin_ia32_sqrtss_round_mask: diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c index 4863ba0bd8848..774e1452d10fa 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c @@ -465,3 +465,57 @@ __mmask64 test_kmov_q(__mmask64 A) { return __builtin_ia32_kmovq(A); } + +__mmask32 test_mm512_kunpackw(__mmask32 A, __mmask32 B) { + // CIR-LABEL: _mm512_kunpackw + // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>> + // CIR: cir.cast bitcast {{.*}} : 
!u32i -> !cir.vector<32 x !cir.int<s, 1>> + // CIR: cir.vec.shuffle + // CIR: cir.vec.shuffle + // CIR: cir.vec.shuffle + // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i + + // LLVM-LABEL: _mm512_kunpackw + // LLVM: [[A_VEC:%.*]] = bitcast i32 %{{.*}} to <32 x i1> + // LLVM: [[B_VEC:%.*]] = bitcast i32 %{{.*}} to <32 x i1> + // LLVM: [[A_HALF:%.*]] = shufflevector <32 x i1> [[A_VEC]], <32 x i1> [[A_VEC]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // LLVM: [[B_HALF:%.*]] = shufflevector <32 x i1> [[B_VEC]], <32 x i1> [[B_VEC]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // LLVM: [[RES:%.*]] = shufflevector <16 x i1> [[B_HALF]], <16 x i1> [[A_HALF]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + // LLVM: bitcast <32 x i1> [[RES]] to i32 + + // OGCG-LABEL: _mm512_kunpackw + // OGCG: [[A_VEC:%.*]] = bitcast i32 %{{.*}} to <32 x i1> + // OGCG: [[B_VEC:%.*]] = bitcast i32 %{{.*}} to <32 x i1> + // OGCG: [[A_HALF:%.*]] = shufflevector <32 x i1> [[A_VEC]], <32 x i1> [[A_VEC]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // OGCG: [[B_HALF:%.*]] = shufflevector <32 x i1> [[B_VEC]], <32 x i1> [[B_VEC]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // OGCG: [[RES:%.*]] = shufflevector <16 x i1> [[B_HALF]], <16 x i1> [[A_HALF]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + // OGCG: bitcast <32 x i1> [[RES]] to i32 + return _mm512_kunpackw(A, B); +} + +__mmask64 test_mm512_kunpackd(__mmask64 A, __mmask64 B) { + // CIR-LABEL: _mm512_kunpackd + // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>> + // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>> + // CIR: cir.vec.shuffle + // CIR: cir.vec.shuffle + // CIR: cir.vec.shuffle + // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i + + // LLVM-LABEL: _mm512_kunpackd + // LLVM: [[A_VEC:%.*]] = bitcast i64 %{{.*}} to <64 x i1> + // LLVM: [[B_VEC:%.*]] = bitcast i64 %{{.*}} to <64 x i1> + // LLVM: [[A_HALF:%.*]] = shufflevector <64 x i1> [[A_VEC]], <64 x i1> [[A_VEC]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + // LLVM: [[B_HALF:%.*]] = shufflevector <64 x i1> [[B_VEC]], <64 x i1> [[B_VEC]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + // LLVM: [[RES:%.*]] = shufflevector <32 x i1> [[B_HALF]], <32 x i1> 
[[A_HALF]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + // LLVM: bitcast <64 x i1> [[RES]] to i64 + + // OGCG-LABEL: _mm512_kunpackd + // OGCG: [[A_VEC:%.*]] = bitcast i64 %{{.*}} to <64 x i1> + // OGCG: [[B_VEC:%.*]] = bitcast i64 %{{.*}} to <64 x i1> + // OGCG: [[A_HALF:%.*]] = shufflevector <64 x i1> [[A_VEC]], <64 x i1> [[A_VEC]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + // OGCG: [[B_HALF:%.*]] = shufflevector <64 x i1> [[B_VEC]], <64 x i1> [[B_VEC]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + // OGCG: [[RES:%.*]] = shufflevector <32 x i1> [[B_HALF]], <32 x i1> [[A_HALF]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + // OGCG: bitcast <64 x i1> [[RES]] to i64 + return _mm512_kunpackd(A, B); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c index 9d957f5de554d..e03109510a931 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c @@ -228,6 +228,33 @@ __mmask16 test_kmov_w(__mmask16 A) { // OGCG: bitcast <16 x i1> {{.*}} to i16 return __builtin_ia32_kmovw(A); } + +__mmask16 test_mm512_kunpackb(__mmask16 A, __mmask16 B) { + // CIR-LABEL: _mm512_kunpackb + // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>> + // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>> + // CIR: cir.vec.shuffle + // CIR: cir.vec.shuffle + // CIR: cir.vec.shuffle + // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i + + // LLVM-LABEL: _mm512_kunpackb + // LLVM: [[A_VEC:%.*]] = bitcast i16 %{{.*}} to <16 x i1> + // LLVM: [[B_VEC:%.*]] = bitcast i16 %{{.*}} to <16 x i1> + // LLVM: [[A_HALF:%.*]] = shufflevector <16 x i1> [[A_VEC]], <16 x i1> [[A_VEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // LLVM: [[B_HALF:%.*]] = shufflevector <16 x i1> [[B_VEC]], <16 x i1> [[B_VEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // LLVM: [[RES:%.*]] = shufflevector <8 x i1> [[B_HALF]], <8 x i1> [[A_HALF]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // LLVM: bitcast <16 x i1> [[RES]] to i16 + + // OGCG-LABEL: _mm512_kunpackb + // OGCG: [[A_VEC:%.*]] = bitcast i16 %{{.*}} to <16 x i1> + // OGCG: [[B_VEC:%.*]] = bitcast 
i16 %{{.*}} to <16 x i1> + // OGCG: [[A_HALF:%.*]] = shufflevector <16 x i1> [[A_VEC]], <16 x i1> [[A_VEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // OGCG: [[B_HALF:%.*]] = shufflevector <16 x i1> [[B_VEC]], <16 x i1> [[B_VEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + // OGCG: [[RES:%.*]] = shufflevector <8 x i1> [[B_HALF]], <8 x i1> [[A_HALF]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + // OGCG: bitcast <16 x i1> [[RES]] to i16 + return _mm512_kunpackb(A, B); +} __m256 test_mm512_i64gather_ps(__m512i __index, void const *__addr) { // CIR-LABEL: test_mm512_i64gather_ps // CIR: cir.call_llvm_intrinsic "x86.avx512.mask.gather.qps.512"