llvm · GeneraluseAI · Nov 27, 2025 · andykaylor · Dec 1, 2025 · andykaylor
@@ -115,6 +115,47 @@ static mlir::Value emitX86MaskLogic(CIRGenBuilderTy &builder,
                                ops[0].getType());
 }
 
+static mlir::Value emitX86Muldq(CIRGenFunction &cgf, const CallExpr *expr,
-static mlir::Value emitX86Muldq(CIRGenFunction &cgf, const CallExpr *expr,
+static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, mlir::Location loc,
-static mlir::Value emitX86Muldq(CIRGenFunction &cgf, const CallExpr *expr,
+static mlir::Value emitX86Muldq(CIRGenBuilderTy &builder, mlir::Location loc,
+                                bool isSigned,
+                                SmallVectorImpl<mlir::Value> &ops) {
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+  mlir::Location loc = cgf.getLoc(expr->getExprLoc());
+  mlir::Type ty = ops[0].getType();
+  unsigned tyPrimitiveSizeInBits =
+      cgf.cgm.getDataLayout().getTypeSizeInBits(ty);
-      cgf.cgm.getDataLayout().getTypeSizeInBits(ty);
+  unsigned tyPrimitiveSizeInBits =
+      cgf.cgm.getDataLayout().getTypeSizeInBits(ops[0].getType());
+  // Arguments have a vXi32 type so cast to vXi64.      
+  mlir::Type ty = cir::VectorType::get(builder.getSInt64Ty(), tyPrimitiveSizeInBits / 64);
-      cgf.cgm.getDataLayout().getTypeSizeInBits(ty);
+  unsigned tyPrimitiveSizeInBits =
+      cgf.cgm.getDataLayout().getTypeSizeInBits(ops[0].getType());
+  // Arguments have a vXi32 type so cast to vXi64.      
+  mlir::Type ty = cir::VectorType::get(builder.getSInt64Ty(), tyPrimitiveSizeInBits / 64);
+  mlir::Value lhs, rhs;
-  mlir::Value lhs, rhs;
+  mlir::Value lhs = builder.createBitcast(loc, ops[0], ty);
+  mlir::Value rhs = builder.createBitcast(loc, ops[1], ty);
-  mlir::Value lhs, rhs;
+  mlir::Value lhs = builder.createBitcast(loc, ops[0], ty);
+  mlir::Value rhs = builder.createBitcast(loc, ops[1], ty);
+  // in cir, if a shiftOperation is shift right,it will be translated into Ashr
+  // or lShr automatically in match and rewrite stage according to its operand's
+  // type
+  if (isSigned) {
+    ty =
+        cir::VectorType::get(builder.getSInt64Ty(), tyPrimitiveSizeInBits / 64);
+    cir::ConstantOp shiftAmt =
+        builder.getConstant(loc, cir::IntAttr::get(builder.getSInt64Ty(), 32));
+    cir::VecSplatOp shiftSplatVecOp =
+        cir::VecSplatOp::create(builder, loc, ty, shiftAmt.getResult());
+    mlir::Value shiftSplatValue = shiftSplatVecOp.getResult();
+    lhs = builder.createBitcast(loc, ops[0], ty);
+    rhs = builder.createBitcast(loc, ops[1], ty);
+    lhs = builder.createShift(loc, lhs, shiftSplatValue, true);
+    lhs = builder.createShift(loc, lhs, shiftSplatValue, false);
+    rhs = builder.createShift(loc, rhs, shiftSplatValue, true);
+    rhs = builder.createShift(loc, rhs, shiftSplatValue, false);
+  } else {
+    ty =
+        cir::VectorType::get(builder.getSInt64Ty(), tyPrimitiveSizeInBits / 64);
+    cir::ConstantOp maskScalar = builder.getConstant(
+        loc, cir::IntAttr::get(builder.getSInt64Ty(), 0xffffffff));
+    cir::VecSplatOp mask =
+        cir::VecSplatOp::create(builder, loc, ty, maskScalar.getResult());
+    lhs = builder.createBitcast(loc, ops[0], ty);
+    rhs = builder.createBitcast(loc, ops[1], ty);
+    lhs = builder.createAnd(loc, lhs, mask);
+    rhs = builder.createAnd(loc, rhs, mask);
+  }
+  return builder.createMul(loc, lhs, rhs);
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -851,12 +892,18 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_sqrtph512:
   case X86::BI__builtin_ia32_sqrtps512:
   case X86::BI__builtin_ia32_sqrtpd512:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_pmuludq128:
   case X86::BI__builtin_ia32_pmuludq256:
   case X86::BI__builtin_ia32_pmuludq512:
+    return emitX86Muldq(*this, expr, /*IsSigned*/ false, ops);
-    return emitX86Muldq(*this, expr, /*IsSigned*/ false, ops);
+    return emitX86Muldq(builder, getLoc(expr->getExprLoc()), /*isSigned=*/false, ops);
-    return emitX86Muldq(*this, expr, /*IsSigned*/ false, ops);
+    return emitX86Muldq(builder, getLoc(expr->getExprLoc()), /*isSigned=*/false, ops);
   case X86::BI__builtin_ia32_pmuldq128:
   case X86::BI__builtin_ia32_pmuldq256:
   case X86::BI__builtin_ia32_pmuldq512:
+    return emitX86Muldq(*this, expr, /*IsSigned*/ true, ops);
-    return emitX86Muldq(*this, expr, /*IsSigned*/ true, ops);
+    return emitX86Muldq(builder, getLoc(expr->getExprLoc), /*isSigned=*/true, ops);
-    return emitX86Muldq(*this, expr, /*IsSigned*/ true, ops);
+    return emitX86Muldq(builder, getLoc(expr->getExprLoc), /*isSigned=*/true, ops);
   case X86::BI__builtin_ia32_pternlogd512_mask:
   case X86::BI__builtin_ia32_pternlogq512_mask:
   case X86::BI__builtin_ia32_pternlogd128_mask:

diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx2-builtins.c
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx2 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+
+#include <immintrin.h>
+
+__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
+// CIR-LABEL: _mm256_mul_epu32
+// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
+// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<4 x !s64i>
+// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
+// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
+// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
+// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
+// CIR: [[MUL:%.*]]   = cir.binop(mul, [[AND_A]], [[AND_B]])
+
+// LLVM-LABEL: _mm256_mul_epu32
+// LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
+// LLVM: and <4 x i64> %{{.*}}, splat (i64 4294967295)
+// LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
+
+// OGCG-LABEL: _mm256_mul_epu32
+// OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
+// OGCG: and <4 x i64> %{{.*}}, splat (i64 4294967295)
+// OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
+
+return _mm256_mul_epu32(a, b);
+}
+
+__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
+  // CIR-LABEL: _mm256_mul_epi32
+  // CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
+  // CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<4 x !s64i>
+  // CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
+  // CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<4 x !s64i>
+  // CIR: [[SHL_A:%.*]]  = cir.shift(left, [[A64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
+  // CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
+  // CIR: [[SHL_B:%.*]]  = cir.shift(left, [[B64]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
+  // CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<4 x !s64i>, [[SV]] : !cir.vector<4 x !s64i>)
+  // CIR: [[MUL:%.*]]    = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
+
+  // LLVM-LABEL: _mm256_mul_epi32
+  // LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: shl <4 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <4 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: mul <4 x i64> %{{.*}}, %{{.*}}
+
+  // OGCG-LABEL: _mm256_mul_epi32
+  // OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: shl <4 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <4 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: mul <4 x i64> %{{.*}}, %{{.*}}
+
+  return _mm256_mul_epi32(a, b);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
@@ -228,3 +228,58 @@ __mmask16 test_kmov_w(__mmask16 A) {
   // OGCG: bitcast <16 x i1> {{.*}} to i16
   return __builtin_ia32_kmovw(A);
 }
+
+
+__m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
+  // CIR-LABEL: _mm512_mul_epi32
+  // CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
+  // CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<8 x !s64i>
+  // CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
+  // CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : !cir.vector<16 x !s32i> -> !cir.vector<8 x !s64i>
+  // CIR: [[SHL_A:%.*]]  = cir.shift(left, [[A64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
+  // CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
+  // CIR: [[SHL_B:%.*]]  = cir.shift(left, [[B64]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
+  // CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<8 x !s64i>, [[SV]] : !cir.vector<8 x !s64i>)
+  // CIR: [[MUL:%.*]]    = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
+
+  // LLVM-LABEL: _mm512_mul_epi32
+  // LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: shl <8 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <8 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
+
+  // OGCG-LABEL: _mm512_mul_epi32
+  // OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: shl <8 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <8 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
+
+  return _mm512_mul_epi32(__A, __B);
+}
+
+
+__m512i test_mm512_mul_epu32(__m512i __A, __m512i __B) {
+// CIR-LABEL: _mm512_mul_epu32
+// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
+// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<8 x !s64i>
+// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
+// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<8 x !s64i>
+// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
+// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
+// CIR: [[MUL:%.*]]   = cir.binop(mul, [[AND_A]], [[AND_B]])
+
+// LLVM-LABEL: _mm512_mul_epu32
+// LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
+// LLVM: and <8 x i64> %{{.*}}, splat (i64 4294967295)
+// LLVM: mul <8 x i64> %{{.*}}, %{{.*}}
+
+// OGCG-LABEL: _mm512_mul_epu32
+// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
+// OGCG: and <8 x i64> %{{.*}}, splat (i64 4294967295)
+// OGCG: mul <8 x i64> %{{.*}}, %{{.*}}
+
+return _mm512_mul_epu32(__A, __B);
+}
+
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/sse2-builtins.c
@@ -108,3 +108,26 @@ void test_mm_pause(void) {
   // LLVM: call void @llvm.x86.sse2.pause()
   // OGCG: call void @llvm.x86.sse2.pause()
 }
+
+__m128i test_mm_mul_epu32(__m128i A, __m128i B) {
+// CIR-LABEL: _mm_mul_epu32
+// CIR: [[MASK_SCALAR:%.*]] = cir.const #cir.int<4294967295> : !s64i
+// CIR: [[MASK_VEC:%.*]] = cir.vec.splat [[MASK_SCALAR]] : !s64i, !cir.vector<2 x !s64i>
+// CIR: [[BC_A:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
+// CIR: [[BC_B:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
+// CIR: [[AND_A:%.*]] = cir.binop(and, [[BC_A]], [[MASK_VEC]])
+// CIR: [[AND_B:%.*]] = cir.binop(and, [[BC_B]], [[MASK_VEC]])
+// CIR: [[MUL:%.*]]   = cir.binop(mul, [[AND_A]], [[AND_B]])
+
+// LLVM-LABEL: _mm_mul_epu32
+// LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
+// LLVM: and <2 x i64> %{{.*}}, splat (i64 4294967295)
+// LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
+
+// OGCG-LABEL: _mm_mul_epu32
+// OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
+// OGCG: and <2 x i64> %{{.*}}, splat (i64 4294967295)
+// OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
+
+return _mm_mul_epu32(A, B);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/sse41-builtins.c
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +sse4.1 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s --check-prefixes=OGCG
+
+#include <immintrin.h>
+
+__m128i test_mm_mul_epi32(__m128i x, __m128i y) {
+  // CIR-LABEL: _mm_mul_epi32
+  // CIR: [[SC:%.*]] = cir.const #cir.int<32> : !s64i
+  // CIR: [[SV:%.*]] = cir.vec.splat [[SC]] : !s64i, !cir.vector<2 x !s64i>
+  // CIR: [[A64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
+  // CIR: [[B64:%.*]] = cir.cast bitcast %{{.*}} : {{.*}} -> !cir.vector<2 x !s64i>
+  // CIR: [[SHL_A:%.*]]  = cir.shift(left, [[A64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
+  // CIR: [[ASHR_A:%.*]] = cir.shift(right, [[SHL_A]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
+  // CIR: [[SHL_B:%.*]]  = cir.shift(left, [[B64]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
+  // CIR: [[ASHR_B:%.*]] = cir.shift(right, [[SHL_B]] : !cir.vector<2 x !s64i>, [[SV]] : !cir.vector<2 x !s64i>)
+  // CIR: [[MUL:%.*]]    = cir.binop(mul, [[ASHR_A]], [[ASHR_B]])
+
+  // LLVM-LABEL: _mm_mul_epi32
+  // LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: shl <2 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: ashr <2 x i64> %{{.*}}, splat (i64 32)
+  // LLVM: mul <2 x i64> %{{.*}}, %{{.*}}
+
+  // OGCG-LABEL: _mm_mul_epi32
+  // OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: shl <2 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: ashr <2 x i64> %{{.*}}, splat (i64 32)
+  // OGCG: mul <2 x i64> %{{.*}}, %{{.*}}
+
+  return _mm_mul_epi32(x, y);
+}