diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 0e43345bad6f1..b242efc00e491 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -85,6 +85,36 @@ static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Location loc,
   return maskVec;
 }
 
+static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder,
+                                       mlir::Location loc,
+                                       const std::string &intrinsicName,
+                                       SmallVectorImpl<mlir::Value> &ops) {
+
+  auto intTy = cast<cir::IntType>(ops[0].getType());
+  unsigned numElts = intTy.getWidth();
+  mlir::Value lhsVec = getMaskVecValue(builder, loc, ops[0], numElts);
+  mlir::Value rhsVec = getMaskVecValue(builder, loc, ops[1], numElts);
+  mlir::Type vecTy = lhsVec.getType();
+  mlir::Value resVec = emitIntrinsicCallOp(builder, loc, intrinsicName, vecTy,
+                                           mlir::ValueRange{lhsVec, rhsVec});
+  return builder.createBitcast(resVec, ops[0].getType());
+}
+
+static mlir::Value emitX86MaskLogic(CIRGenBuilderTy &builder,
+                                    mlir::Location loc,
+                                    cir::BinOpKind binOpKind,
+                                    SmallVectorImpl<mlir::Value> &ops,
+                                    bool invertLHS = false) {
+  unsigned numElts = cast<cir::IntType>(ops[0].getType()).getWidth();
+  mlir::Value lhs = getMaskVecValue(builder, loc, ops[0], numElts);
+  mlir::Value rhs = getMaskVecValue(builder, loc, ops[1], numElts);
+
+  if (invertLHS)
+    lhs = builder.createNot(lhs);
+  return builder.createBitcast(builder.createBinop(loc, lhs, binOpKind, rhs),
+                               ops[0].getType());
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -743,38 +773,75 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_ktestzsi:
   case X86::BI__builtin_ia32_ktestcdi:
   case X86::BI__builtin_ia32_ktestzdi:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_kaddqi:
+    return emitX86MaskAddLogic(builder, getLoc(expr->getExprLoc()),
+                               "x86.avx512.kadd.b", ops);
   case X86::BI__builtin_ia32_kaddhi:
+    return emitX86MaskAddLogic(builder, getLoc(expr->getExprLoc()),
+                               "x86.avx512.kadd.w", ops);
   case X86::BI__builtin_ia32_kaddsi:
+    return emitX86MaskAddLogic(builder, getLoc(expr->getExprLoc()),
+                               "x86.avx512.kadd.d", ops);
   case X86::BI__builtin_ia32_kadddi:
+    return emitX86MaskAddLogic(builder, getLoc(expr->getExprLoc()),
+                               "x86.avx512.kadd.q", ops);
   case X86::BI__builtin_ia32_kandqi:
   case X86::BI__builtin_ia32_kandhi:
   case X86::BI__builtin_ia32_kandsi:
   case X86::BI__builtin_ia32_kanddi:
+    return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()),
+                            cir::BinOpKind::And, ops);
   case X86::BI__builtin_ia32_kandnqi:
   case X86::BI__builtin_ia32_kandnhi:
   case X86::BI__builtin_ia32_kandnsi:
   case X86::BI__builtin_ia32_kandndi:
+    return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()),
+                            cir::BinOpKind::And, ops, true);
   case X86::BI__builtin_ia32_korqi:
   case X86::BI__builtin_ia32_korhi:
   case X86::BI__builtin_ia32_korsi:
   case X86::BI__builtin_ia32_kordi:
+    return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()),
+                            cir::BinOpKind::Or, ops);
   case X86::BI__builtin_ia32_kxnorqi:
   case X86::BI__builtin_ia32_kxnorhi:
   case X86::BI__builtin_ia32_kxnorsi:
   case X86::BI__builtin_ia32_kxnordi:
+    return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()),
+                            cir::BinOpKind::Xor, ops, true);
   case X86::BI__builtin_ia32_kxorqi:
   case X86::BI__builtin_ia32_kxorhi:
   case X86::BI__builtin_ia32_kxorsi:
   case X86::BI__builtin_ia32_kxordi:
+    return emitX86MaskLogic(builder, getLoc(expr->getExprLoc()),
+                            cir::BinOpKind::Xor, ops);
   case X86::BI__builtin_ia32_knotqi:
   case X86::BI__builtin_ia32_knothi:
   case X86::BI__builtin_ia32_knotsi:
-  case X86::BI__builtin_ia32_knotdi:
+  case X86::BI__builtin_ia32_knotdi: {
+    cir::IntType intTy = cast<cir::IntType>(ops[0].getType());
+    unsigned numElts = intTy.getWidth();
+    mlir::Value resVec =
+        getMaskVecValue(builder, getLoc(expr->getExprLoc()), ops[0], numElts);
+    return builder.createBitcast(builder.createNot(resVec), ops[0].getType());
+  }
   case X86::BI__builtin_ia32_kmovb:
   case X86::BI__builtin_ia32_kmovw:
   case X86::BI__builtin_ia32_kmovd:
-  case X86::BI__builtin_ia32_kmovq:
+  case X86::BI__builtin_ia32_kmovq: {
+    // Bitcast to vXi1 type and then back to integer. This gets the mask
+    // register type into the IR, but might be optimized out depending on
+    // what's around it.
+    cir::IntType intTy = cast<cir::IntType>(ops[0].getType());
+    unsigned numElts = intTy.getWidth();
+    mlir::Value resVec =
+        getMaskVecValue(builder, getLoc(expr->getExprLoc()), ops[0], numElts);
+    return builder.createBitcast(resVec, ops[0].getType());
+  }
   case X86::BI__builtin_ia32_kunpckdi:
   case X86::BI__builtin_ia32_kunpcksi:
   case X86::BI__builtin_ia32_kunpckhi:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c
index 3522e2c7e50bf..4863ba0bd8848 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c
@@ -115,3 +115,353 @@ __mmask32 test_kshiftri_mask32_out_of_range(__mmask32 A) {
 
   return _kshiftri_mask32(A, 33);
 }
+
+
+__mmask32 test_kadd_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kadd_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.call_llvm_intrinsic "x86.avx512.kadd.d"
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+
+  // LLVM-LABEL: _kadd_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[RES:%.*]] = call <32 x i1> @llvm.x86.avx512.kadd.d(<32 x i1> [[L]], <32 x i1> [[R]])
+  // LLVM: bitcast <32 x i1> [[RES]] to i32
+
+  // OGCG-LABEL: _kadd_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: call <32 x i1> @llvm.x86.avx512.kadd.d
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kadd_mask32(A, B);
+}
+
+__mmask64 test_kadd_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kadd_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.call_llvm_intrinsic "x86.avx512.kadd.q"
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+
+  // LLVM-LABEL: _kadd_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[RES:%.*]] = call <64 x i1> @llvm.x86.avx512.kadd.q(<64 x i1> [[L]], <64 x i1> [[R]])
+  // LLVM: bitcast <64 x i1> [[RES]] to i64
+
+  // OGCG-LABEL: _kadd_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: call <64 x i1> @llvm.x86.avx512.kadd.q
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kadd_mask64(A, B);
+}
+
+__mmask32 test_kand_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kand_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+
+  // LLVM-LABEL: _kand_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[RES:%.*]] = and <32 x i1> [[L]], [[R]]
+  // LLVM: bitcast <32 x i1> [[RES]] to i32
+
+  // OGCG-LABEL: _kand_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: and <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kand_mask32(A, B);
+}
+
+__mmask64 test_kand_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kand_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+
+  // LLVM-LABEL: _kand_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[RES:%.*]] = and <64 x i1> [[L]], [[R]]
+  // LLVM: bitcast <64 x i1> [[RES]] to i64
+
+  // OGCG-LABEL: _kand_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: and <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kand_mask64(A, B);
+}
+
+__mmask32 test_kandn_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kandn_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+
+  // LLVM-LABEL: _kandn_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: xor <32 x i1> [[L]], splat (i1 true)
+  // LLVM: and <32 x i1>
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: _kandn_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: xor <32 x i1>
+  // OGCG: and <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kandn_mask32(A, B);
+}
+
+__mmask64 test_kandn_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kandn_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+
+  // LLVM-LABEL: _kandn_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: xor <64 x i1> [[L]], splat (i1 true)
+  // LLVM: and <64 x i1>
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: _kandn_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: xor <64 x i1>
+  // OGCG: and <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kandn_mask64(A, B);
+}
+
+__mmask32 test_kor_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kor_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.binop(or, {{.*}}, {{.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+
+  // LLVM-LABEL: _kor_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: or <32 x i1> [[L]], [[R]]
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: _kor_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: or <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kor_mask32(A, B);
+}
+
+__mmask64 test_kor_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kor_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.binop(or, {{.*}}, {{.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+
+  // LLVM-LABEL: _kor_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: or <64 x i1> [[L]], [[R]]
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: _kor_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: or <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kor_mask64(A, B);
+}
+
+__mmask32 test_kxor_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kxor_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+
+  // LLVM-LABEL: _kxor_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: xor <32 x i1> [[L]], [[R]]
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: _kxor_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: xor <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _kxor_mask32(A, B);
+}
+
+__mmask64 test_kxor_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kxor_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+
+  // LLVM-LABEL: _kxor_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: xor <64 x i1> [[L]], [[R]]
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: _kxor_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: xor <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _kxor_mask64(A, B);
+}
+
+__mmask32 test_kxnor_mask32(__mmask32 A, __mmask32 B) {
+  // CIR-LABEL: _kxnor_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+
+  // LLVM-LABEL: _kxnor_mask32
+  // LLVM: [[L:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[R:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[NOT:%.*]] = xor <32 x i1> [[L]], splat (i1 true)
+  // LLVM: [[RES:%.*]] = xor <32 x i1> [[NOT]], [[R]]
+  // LLVM: bitcast <32 x i1> [[RES]] to i32
+
+  // OGCG-LABEL: _kxnor_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: xor <32 x i1>
+  // OGCG: xor <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+
+  return _kxnor_mask32(A, B);
+}
+
+__mmask64 test_kxnor_mask64(__mmask64 A, __mmask64 B) {
+  // CIR-LABEL: _kxnor_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+
+  // LLVM-LABEL: _kxnor_mask64
+  // LLVM: [[L:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[R:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[NOT:%.*]] = xor <64 x i1> [[L]], splat (i1 true)
+  // LLVM: [[RES:%.*]] = xor <64 x i1> [[NOT]], [[R]]
+  // LLVM: bitcast <64 x i1> [[RES]] to i64
+
+  // OGCG-LABEL: _kxnor_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: xor <64 x i1>
+  // OGCG: xor <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+
+  return _kxnor_mask64(A, B);
+}
+
+
+__mmask32 test_knot_mask32(__mmask32 A) {
+  // CIR-LABEL: _knot_mask32
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+
+  // LLVM-LABEL: _knot_mask32
+  // LLVM: bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: xor <32 x i1>
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: _knot_mask32
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: xor <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+  return _knot_mask32(A);
+}
+
+__mmask64 test_knot_mask64(__mmask64 A) {
+  // CIR-LABEL: _knot_mask64
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+
+  // LLVM-LABEL: _knot_mask64
+  // LLVM: bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: xor <64 x i1>
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: _knot_mask64
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: xor <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+  return _knot_mask64(A);
+}
+
+// Multiple user-level mask helpers inline to this same kmov builtin.
+// CIR does not implement any special lowering for those helpers.
+//
+// Therefore, testing the builtin (__builtin_ia32_kmov*) directly is
+// sufficient to cover the CIR lowering behavior. Testing each helper
+// individually would add no new CIR paths.
+
+__mmask32 test_kmov_d(__mmask32 A) {
+  // CIR-LABEL: test_kmov_d
+  // CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<u, 1>> -> !u32i
+
+  // LLVM-LABEL: test_kmov_d
+  // LLVM: bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: bitcast <32 x i1> {{.*}} to i32
+
+  // OGCG-LABEL: test_kmov_d
+  // OGCG: bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: bitcast <32 x i1> {{.*}} to i32
+
+  return __builtin_ia32_kmovd(A);
+}
+
+// Multiple user-level mask helpers inline to this same kmov builtin.
+// CIR does not implement any special lowering for those helpers.
+//
+// Therefore, testing the builtin (__builtin_ia32_kmov*) directly is
+// sufficient to cover the CIR lowering behavior. Testing each helper
+// individually would add no new CIR paths.
+
+__mmask64 test_kmov_q(__mmask64 A) {
+  // CIR-LABEL: test_kmov_q
+  // CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<u, 1>> -> !u64i
+
+  // LLVM-LABEL: test_kmov_q
+  // LLVM: bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: bitcast <64 x i1> {{.*}} to i64
+
+  // OGCG-LABEL: test_kmov_q
+  // OGCG: bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: bitcast <64 x i1> {{.*}} to i64
+
+  return __builtin_ia32_kmovq(A);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512dq-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512dq-builtins.c
new file mode 100644
index 0000000000000..5d81f666271be
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512dq-builtins.c
@@ -0,0 +1,210 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -fclangir -emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512dq -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+#include <immintrin.h>
+
+__mmask8 test_kadd_mask8(__mmask8 A, __mmask8 B) {
+ // CIR-LABEL: _kadd_mask8
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.call_llvm_intrinsic "x86.avx512.kadd.b"
+ // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+
+ // LLVM-LABEL: _kadd_mask8
+ // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.kadd.b(<8 x i1> [[L]], <8 x i1> [[R]])
+ // LLVM: bitcast <8 x i1> [[RES]] to i8
+
+ // OGCG-LABEL: _kadd_mask8
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: call <8 x i1> @llvm.x86.avx512.kadd.b
+ // OGCG: bitcast <8 x i1> {{.*}} to i8
+ return _kadd_mask8(A, B);
+}
+
+__mmask16 test_kadd_mask16(__mmask16 A, __mmask16 B) {
+ // CIR-LABEL: _kadd_mask16
+ // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+ // CIR: cir.call_llvm_intrinsic "x86.avx512.kadd.w"
+ // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+
+ // LLVM-LABEL: _kadd_mask16
+ // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+ // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+ // LLVM: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.kadd.w(<16 x i1> [[L]], <16 x i1> [[R]])
+ // LLVM: bitcast <16 x i1> [[RES]] to i16
+
+ // OGCG-LABEL: _kadd_mask16
+ // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+ // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+ // OGCG: call <16 x i1> @llvm.x86.avx512.kadd.w
+ // OGCG: bitcast <16 x i1> {{.*}} to i16
+ return _kadd_mask16(A, B);
+}
+
+__mmask8 test_kand_mask8(__mmask8 A, __mmask8 B) {
+ // CIR-LABEL: _kand_mask8
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+
+ // LLVM-LABEL: _kand_mask8
+ // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: [[RES:%.*]] = and <8 x i1> [[L]], [[R]]
+ // LLVM: bitcast <8 x i1> [[RES]] to i8
+
+ // OGCG-LABEL: _kand_mask8
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: and <8 x i1>
+ // OGCG: bitcast <8 x i1> {{.*}} to i8
+ return _kand_mask8(A, B);
+}
+
+
+__mmask8 test_kandn_mask8(__mmask8 A, __mmask8 B) {
+ // CIR-LABEL: _kandn_mask8
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.unary(not, {{.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+
+ // LLVM-LABEL: _kandn_mask8
+ // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: xor <8 x i1> [[L]], splat (i1 true)
+ // LLVM: and <8 x i1>
+ // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+ // OGCG-LABEL: _kandn_mask8
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: xor <8 x i1>
+ // OGCG: and <8 x i1>
+ // OGCG: bitcast <8 x i1> {{.*}} to i8
+
+ return _kandn_mask8(A, B);
+}
+
+__mmask8 test_kor_mask8(__mmask8 A, __mmask8 B) {
+ // CIR-LABEL: _kor_mask8
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.binop(or, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+
+ // LLVM-LABEL: _kor_mask8
+ // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: or <8 x i1> [[L]], [[R]]
+ // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+ // OGCG-LABEL: _kor_mask8
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: or <8 x i1>
+ // OGCG: bitcast <8 x i1> {{.*}} to i8
+ return _kor_mask8(A, B);
+}
+
+__mmask8 test_kxor_mask8(__mmask8 A, __mmask8 B) {
+ // CIR-LABEL: _kxor_mask8
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+
+ // LLVM-LABEL: _kxor_mask8
+ // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: xor <8 x i1> [[L]], [[R]]
+ // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+ // OGCG-LABEL: _kxor_mask8
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: xor <8 x i1>
+ // OGCG: bitcast <8 x i1> {{.*}} to i8
+ return _kxor_mask8(A, B);
+}
+
+__mmask8 test_kxnor_mask8(__mmask8 A, __mmask8 B) {
+ // CIR-LABEL: _kxnor_mask8
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.unary(not, {{.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+
+ // LLVM-LABEL: _kxnor_mask8
+ // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: [[R:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: [[NOT:%.*]] = xor <8 x i1> [[L]], splat (i1 true)
+ // LLVM: [[RES:%.*]] = xor <8 x i1> [[NOT]], [[R]]
+ // LLVM: bitcast <8 x i1> [[RES]] to i8
+
+ // OGCG-LABEL: _kxnor_mask8
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: xor <8 x i1>
+ // OGCG: xor <8 x i1>
+ // OGCG: bitcast <8 x i1> {{.*}} to i8
+ return _kxnor_mask8(A, B);
+}
+
+
+__mmask8 test_knot_mask8(__mmask8 A) {
+ // CIR-LABEL: _knot_mask8
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.unary(not, {{.*}}) : !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+
+ // LLVM-LABEL: _knot_mask8
+ // LLVM: [[L:%.*]] = bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: xor <8 x i1> [[L]], {{.*}}
+ // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+ // OGCG-LABEL: _knot_mask8
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: xor <8 x i1>
+ // OGCG: bitcast <8 x i1> {{.*}} to i8
+ return _knot_mask8(A);
+}
+
+// Multiple user-level mask helpers inline to this same kmov builtin.
+// CIR does not implement any special lowering for those helpers.
+//
+// Therefore, testing the builtin (__builtin_ia32_kmov*) directly is
+// sufficient to cover the CIR lowering behavior. Testing each helper
+// individually would add no new CIR paths.
+
+__mmask8 test_kmov_b(__mmask8 A) {
+ // CIR-LABEL: test_kmov_b
+ // CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+ // CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<u, 1>> -> !u8i
+
+ // LLVM-LABEL: test_kmov_b
+ // LLVM: bitcast i8 %{{.*}} to <8 x i1>
+ // LLVM: bitcast <8 x i1> {{.*}} to i8
+
+ // OGCG-LABEL: test_kmov_b
+ // OGCG: bitcast i8 %{{.*}} to <8 x i1>
+ // OGCG: bitcast <8 x i1> {{.*}} to i8
+ return __builtin_ia32_kmovb(A);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
index dc54a87856a7c..31d6bc3d22408 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
@@ -77,3 +77,154 @@ __m512i test_mm512_undefined_epi32(void) {
   // OGCG: ret <8 x i64> zeroinitializer
   return _mm512_undefined_epi32();
 }
+
+__mmask16 test_mm512_kand(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _mm512_kand
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_kand
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[RES:%.*]] = and <16 x i1> [[L]], [[R]]
+  // LLVM: bitcast <16 x i1> [[RES]] to i16
+
+  // OGCG-LABEL: _mm512_kand
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: and <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_kand(A, B);
+}
+
+__mmask16 test_mm512_kandn(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _mm512_kandn
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(and, {{.*}}, {{.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_kandn
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: xor <16 x i1> [[L]], splat (i1 true)
+  // LLVM: and <16 x i1>
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: _mm512_kandn
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: xor <16 x i1>
+  // OGCG: and <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_kandn(A, B);
+}
+
+__mmask16 test_mm512_kor(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _mm512_kor
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(or, {{.*}}, {{.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_kor
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: or <16 x i1> [[L]], [[R]]
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: _mm512_kor
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: or <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_kor(A, B);
+}
+
+__mmask16 test_mm512_kxnor(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _mm512_kxnor
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_kxnor
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[NOT:%.*]] = xor <16 x i1> [[L]], splat (i1 true)
+  // LLVM: [[RES:%.*]] = xor <16 x i1> [[NOT]], [[R]]
+  // LLVM: bitcast <16 x i1> [[RES]] to i16
+
+  // OGCG-LABEL: _mm512_kxnor
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: xor <16 x i1>
+  // OGCG: xor <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_kxnor(A, B);
+}
+
+__mmask16 test_mm512_kxor(__mmask16 A, __mmask16 B) {
+  // CIR-LABEL: _mm512_kxor
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.binop(xor, {{.*}}, {{.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_kxor
+  // LLVM: [[L:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: [[R:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: xor <16 x i1> [[L]], [[R]]
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: _mm512_kxor
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: xor <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_kxor(A, B);
+}
+
+__mmask16 test_mm512_knot(__mmask16 A) {
+  // CIR-LABEL: _mm512_knot
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.unary(not, {{.*}}) : !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+
+  // LLVM-LABEL: _mm512_knot
+  // LLVM: bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: xor <16 x i1>
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: _mm512_knot
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: xor <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return _mm512_knot(A);
+}
+
+// Multiple user-level mask helpers inline to this same kmov builtin.
+// CIR does not implement any special lowering for those helpers.
+//
+// Therefore, testing the builtin (__builtin_ia32_kmov*) directly is
+// sufficient to cover the CIR lowering behavior. Testing each helper
+// individually would add no new CIR paths.
+
+__mmask16 test_kmov_w(__mmask16 A) {
+  // CIR-LABEL: test_kmov_w
+  // CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<u, 1>> -> !u16i
+
+  // LLVM-LABEL: test_kmov_w
+  // LLVM: bitcast i16 %{{.*}} to <16 x i1>
+  // LLVM: bitcast <16 x i1> {{.*}} to i16
+
+  // OGCG-LABEL: test_kmov_w
+  // OGCG: bitcast i16 %{{.*}} to <16 x i1>
+  // OGCG: bitcast <16 x i1> {{.*}} to i16
+  return __builtin_ia32_kmovw(A);
+}