From 1c8b6a0b7029d83a4137bb6ddada7c5d3a0bf760 Mon Sep 17 00:00:00 2001 From: generaluseai Date: Fri, 28 Nov 2025 03:05:24 +0800 Subject: [PATCH] [CIR][X86] Implement lowering for pmuldq / pmuludq builtins This patch adds CIR codegen support for X86 pmuldq and pmuludq operations, covering the signed and unsigned variants across all supported vector widths. The builtins now lower to the expected CIR representation matching the semantics of the corresponding LLVM intrinsics. --- clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 52 +++++++++++++- .../CIR/CodeGenBuiltins/X86/avx2-builtins.c | 68 +++++++++++++++++++ .../CodeGenBuiltins/X86/avx512f-builtins.c | 53 +++++++++++++++ .../CIR/CodeGenBuiltins/X86/sse2-builtins.c | 23 +++++++ .../CIR/CodeGenBuiltins/X86/sse41-builtins.c | 45 ++++++++++++ 5 files changed, 239 insertions(+), 2 deletions(-) create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c mode change 100644 => 100755 clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 8d01b7dbd15f62..c0080bb8f6d59a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -130,6 +130,40 @@ static mlir::Value emitVecInsert(CIRGenBuilderTy &builder, mlir::Location loc, return cir::VecInsertOp::create(builder, loc, vec, value, indexVal); } +static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, mlir::Location loc, + bool isSigned, + SmallVectorImpl<mlir::Value> &ops, + unsigned opTypePrimitiveSizeInBits) { + mlir::Type ty = cir::VectorType::get(builder.getSInt64Ty(), + opTypePrimitiveSizeInBits / 64); + mlir::Value lhs = builder.createBitcast(loc, ops[0], ty); + mlir::Value rhs = builder.createBitcast(loc, ops[1], ty); + if (isSigned) { + cir::ConstantOp shiftAmt = + builder.getConstant(loc, cir::IntAttr::get(builder.getSInt64Ty(), 32)); + 
cir::VecSplatOp shiftSplatVecOp = + cir::VecSplatOp::create(builder, loc, ty, shiftAmt.getResult()); + mlir::Value shiftSplatValue = shiftSplatVecOp.getResult(); + // In CIR, right-shift operations are automatically lowered to either an + // arithmetic or logical shift depending on the operand type. The purpose + // of the shifts here is to propagate the sign bit of the 32-bit input + // into the upper bits of each vector lane. + lhs = builder.createShift(loc, lhs, shiftSplatValue, true); + lhs = builder.createShift(loc, lhs, shiftSplatValue, false); + rhs = builder.createShift(loc, rhs, shiftSplatValue, true); + rhs = builder.createShift(loc, rhs, shiftSplatValue, false); + } else { + cir::ConstantOp maskScalar = builder.getConstant( + loc, cir::IntAttr::get(builder.getSInt64Ty(), 0xffffffff)); + cir::VecSplatOp mask = + cir::VecSplatOp::create(builder, loc, ty, maskScalar.getResult()); + // Clear the upper bits + lhs = builder.createAnd(loc, lhs, mask); + rhs = builder.createAnd(loc, rhs, mask); + } + return builder.createMul(loc, lhs, rhs); +} + mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) { if (builtinID == Builtin::BI__builtin_cpu_is) { @@ -956,12 +990,26 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_sqrtph512: case X86::BI__builtin_ia32_sqrtps512: case X86::BI__builtin_ia32_sqrtpd512: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_pmuludq128: case X86::BI__builtin_ia32_pmuludq256: - case X86::BI__builtin_ia32_pmuludq512: + case X86::BI__builtin_ia32_pmuludq512: { + unsigned opTypePrimitiveSizeInBits = + cgm.getDataLayout().getTypeSizeInBits(ops[0].getType()); + return emitX86Muldq(builder, getLoc(expr->getExprLoc()), /*isSigned*/ false, + ops, opTypePrimitiveSizeInBits); + } case X86::BI__builtin_ia32_pmuldq128: case 
X86::BI__builtin_ia32_pmuldq256: - case X86::BI__builtin_ia32_pmuldq512: + case X86::BI__builtin_ia32_pmuldq512: { + unsigned opTypePrimitiveSizeInBits = + cgm.getDataLayout().getTypeSizeInBits(ops[0].getType()); + return emitX86Muldq(builder, getLoc(expr->getExprLoc()), /*isSigned*/ true, + ops, opTypePrimitiveSizeInBits); + } case X86::BI__builtin_ia32_pternlogd512_mask: case X86::BI__builtin_ia32_pternlogq512_mask: case X86::BI__builtin_ia32_pternlogd128_mask: diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c new file mode 100644 index 00000000000000..2152dd50e79343 --- /dev/null +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c @@ -0,0 +1,68 @@ +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c -flax-vector-conversions=none 
-fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG + +#include <immintrin.h> + +__m256i test_mm256_mul_epu32(__m256i a, __m256i b) { + // CIR-LABEL: _mm256_mul_epu32 + // CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i> + // CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i> + // CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i + // CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<4 x !s64i> + // CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]]) + // CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]]) + // CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]]) + + // LLVM-LABEL: _mm256_mul_epu32 + // LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295) + // LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295) + // LLVM: mul <4 x i64> %{{.*}}, %{{.*}} + + // OGCG-LABEL: _mm256_mul_epu32 + // OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295) + // OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295) + // OGCG: mul <4 x i64> %{{.*}}, %{{.*}} + +return _mm256_mul_epu32(a, b); +} + +__m256i test_mm256_mul_epi32(__m256i a, __m256i b) { + // CIR-LABEL: _mm256_mul_epi32 + // CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i> + // CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i> + // CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i 
+ // CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<4 x !s64i> + // CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>) + // CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>) + // CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>) + // CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>) + // CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]]) + + // LLVM-LABEL: _mm256_mul_epi32 + // LLVM: shl <4 x i64> %{{.*}}, splat (i64 32) + // LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32) + // LLVM: shl <4 x i64> %{{.*}}, splat (i64 32) + // LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32) + // LLVM: mul <4 x i64> %{{.*}}, %{{.*}} + + // OGCG-LABEL: _mm256_mul_epi32 + // OGCG: shl <4 x i64> %{{.*}}, splat (i64 32) + // OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32) + // OGCG: shl <4 x i64> %{{.*}}, splat (i64 32) + // OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32) + // OGCG: mul <4 x i64> %{{.*}}, %{{.*}} + + return _mm256_mul_epi32(a, b); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c old mode 100644 new mode 100755 index 9d957f5de554d3..a7d5a19d438b21 --- a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c @@ -419,3 +419,56 @@ __m512i test_mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, __m25 // OGCG: call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512 return _mm512_mask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2); } + +__m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) { + // CIR-LABEL: _mm512_mul_epi32 + // CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i> + // CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> 
!cir.vector<8 x !s64i> + // CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i + // CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<8 x !s64i> + // CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>) + // CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>) + // CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>) + // CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>) + // CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]]) + + // LLVM-LABEL: _mm512_mul_epi32 + // LLVM: shl <8 x i64> %{{.*}}, splat (i64 32) + // LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32) + // LLVM: shl <8 x i64> %{{.*}}, splat (i64 32) + // LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32) + // LLVM: mul <8 x i64> %{{.*}}, %{{.*}} + + // OGCG-LABEL: _mm512_mul_epi32 + // OGCG: shl <8 x i64> %{{.*}}, splat (i64 32) + // OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32) + // OGCG: shl <8 x i64> %{{.*}}, splat (i64 32) + // OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32) + // OGCG: mul <8 x i64> %{{.*}}, %{{.*}} + + return _mm512_mul_epi32(__A, __B); +} + + +__m512i test_mm512_mul_epu32(__m512i __A, __m512i __B) { +// CIR-LABEL: _mm512_mul_epu32 +// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i> +// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i> +// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i +// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<8 x !s64i> +// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]]) +// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]]) +// CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]]) + +// LLVM-LABEL: _mm512_mul_epu32 +// LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295) +// LLVM: and <8 x i64> %{{.*}}, splat (i64 
4294967295) +// LLVM: mul <8 x i64> %{{.*}}, %{{.*}} + +// OGCG-LABEL: _mm512_mul_epu32 +// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295) +// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295) +// OGCG: mul <8 x i64> %{{.*}}, %{{.*}} + +return _mm512_mul_epu32(__A, __B); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c index f5e07cdc28ccd1..46898f01428e3c 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c @@ -108,3 +108,26 @@ void test_mm_pause(void) { // LLVM: call void @llvm.x86.sse2.pause() // OGCG: call void @llvm.x86.sse2.pause() } + +__m128i test_mm_mul_epu32(__m128i A, __m128i B) { + // CIR-LABEL: _mm_mul_epu32 + // CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i> + // CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i> + // CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i + // CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<2 x !s64i> + // CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]]) + // CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]]) + // CIR: [[MUL:%.*]] = cir.binop(mul, [[AND_A]], [[AND_B]]) + + // LLVM-LABEL: _mm_mul_epu32 + // LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295) + // LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295) + // LLVM: mul <2 x i64> %{{.*}}, %{{.*}} + + // OGCG-LABEL: _mm_mul_epu32 + // OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295) + // OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295) + // OGCG: mul <2 x i64> %{{.*}}, %{{.*}} + + return _mm_mul_epu32(A, B); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c new file mode 100644 index 00000000000000..c53d435842b27a --- /dev/null +++ b/clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c @@ -0,0 +1,45 @@ +// RUN: %clang_cc1 -x 
c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG + +#include <immintrin.h> + +__m128i test_mm_mul_epi32(__m128i x, __m128i y) { + // CIR-LABEL: _mm_mul_epi32 + // 
CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i> + // CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i> + // CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i + // CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<2 x !s64i> + // CIR: [[SHL_A:%.*]] = cir.shift(left, [[A64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>) + // CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>) + // CIR: [[SHL_B:%.*]] = cir.shift(left, [[B64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>) + // CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>) + // CIR: [[MUL:%.*]] = cir.binop(mul, [[ASHR_A]], [[ASHR_B]]) + + // LLVM-LABEL: _mm_mul_epi32 + // LLVM: shl <2 x i64> %{{.*}}, splat (i64 32) + // LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32) + // LLVM: shl <2 x i64> %{{.*}}, splat (i64 32) + // LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32) + // LLVM: mul <2 x i64> %{{.*}}, %{{.*}} + + // OGCG-LABEL: _mm_mul_epi32 + // OGCG: shl <2 x i64> %{{.*}}, splat (i64 32) + // OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32) + // OGCG: shl <2 x i64> %{{.*}}, splat (i64 32) + // OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32) + // OGCG: mul <2 x i64> %{{.*}}, %{{.*}} + + return _mm_mul_epi32(x, y); +}