[X86] Add SM4 instructions.
For more details about these instructions, please refer to the latest ISE document: https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html

Reviewed By: pengfei, skan

Differential Revision: https://reviews.llvm.org/D155148
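
As a quick orientation, the sketch below shows what end-user code built with -msm4 might look like once this lands. The wrapper names are hypothetical; operand roles follow the pseudocode in the new sm4intrin.h further down, where __A supplies the round constants/keys and __B the key or data words.

#include <immintrin.h>

// Illustrative wrappers only; a complete SM4 implementation still needs the
// standard FK/CK key schedule and all 32 rounds.
static __m128i sm4_expand4(__m128i ck_constants, __m128i key_words) {
  return _mm_sm4key4_epi32(ck_constants, key_words); // four key-expansion rounds
}

static __m128i sm4_encrypt4(__m128i round_keys, __m128i state) {
  return _mm_sm4rnds4_epi32(round_keys, state); // four encryption rounds
}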
FreddyLeaf committed Jul 20, 2023
1 parent 75d7180 commit 049d6a3
Showing 28 changed files with 1,129 additions and 3 deletions.
3 changes: 3 additions & 0 deletions clang/docs/ReleaseNotes.rst
@@ -821,6 +821,9 @@ X86 Support
* Support intrinsic of ``_mm_sm3msg1_epi32``.
* Support intrinsic of ``_mm_sm3msg2_epi32``.
* Support intrinsic of ``_mm_sm3rnds2_epi32``.
- Support ISA of ``SM4``.
* Support intrinsic of ``_mm(256)_sm4key4_epi32``.
* Support intrinsic of ``_mm(256)_sm4rnds4_epi32``.

Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
6 changes: 6 additions & 0 deletions clang/include/clang/Basic/BuiltinsX86.def
@@ -2151,6 +2151,12 @@ TARGET_BUILTIN(__builtin_ia32_vsm3msg1, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
TARGET_BUILTIN(__builtin_ia32_vsm3msg2, "V4UiV4UiV4UiV4Ui", "nV:128:", "sm3")
TARGET_BUILTIN(__builtin_ia32_vsm3rnds2, "V4UiV4UiV4UiV4UiIUi", "nV:128:", "sm3")

// SM4
TARGET_BUILTIN(__builtin_ia32_vsm4key4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4key4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4128, "V4UiV4UiV4Ui", "nV:128:", "sm4")
TARGET_BUILTIN(__builtin_ia32_vsm4rnds4256, "V8UiV8UiV8Ui", "nV:256:", "sm4")

#undef BUILTIN
#undef TARGET_BUILTIN
#undef TARGET_HEADER_BUILTIN
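
For orientation, the prototype strings above decode roughly as follows: V4Ui/V8Ui are vectors of four or eight unsigned ints (return type first, then parameters), "n" marks the builtin nothrow, "V:128:"/"V:256:" record the minimum vector width, and the last field names the required target feature. A hedged sketch of calling one builtin directly; normal code should go through <sm4intrin.h> instead:

// Assumes -msm4; the _mm_sm4key4_epi32 macro in <sm4intrin.h> wraps this call.
typedef unsigned int v4su_example __attribute__((__vector_size__(16))); // illustrative typedef

static v4su_example call_sm4key4_builtin(v4su_example a, v4su_example b) {
  return __builtin_ia32_vsm4key4128(a, b);
}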
2 changes: 2 additions & 0 deletions clang/include/clang/Driver/Options.td
@@ -5060,6 +5060,8 @@ def msha512 : Flag<["-"], "msha512">, Group<m_x86_Features_Group>;
def mno_sha512 : Flag<["-"], "mno-sha512">, Group<m_x86_Features_Group>;
def msm3 : Flag<["-"], "msm3">, Group<m_x86_Features_Group>;
def mno_sm3 : Flag<["-"], "mno-sm3">, Group<m_x86_Features_Group>;
def msm4 : Flag<["-"], "msm4">, Group<m_x86_Features_Group>;
def mno_sm4 : Flag<["-"], "mno-sm4">, Group<m_x86_Features_Group>;
def mtbm : Flag<["-"], "mtbm">, Group<m_x86_Features_Group>;
def mno_tbm : Flag<["-"], "mno-tbm">, Group<m_x86_Features_Group>;
def mtsxldtrk : Flag<["-"], "mtsxldtrk">, Group<m_x86_Features_Group>;
6 changes: 6 additions & 0 deletions clang/lib/Basic/Targets/X86.cpp
@@ -267,6 +267,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasSHSTK = true;
} else if (Feature == "+sm3") {
HasSM3 = true;
} else if (Feature == "+sm4") {
HasSM4 = true;
} else if (Feature == "+movbe") {
HasMOVBE = true;
} else if (Feature == "+sgx") {
@@ -780,6 +782,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__SGX__");
if (HasSM3)
Builder.defineMacro("__SM3__");
if (HasSM4)
Builder.defineMacro("__SM4__");
if (HasPREFETCHI)
Builder.defineMacro("__PREFETCHI__");
if (HasPREFETCHWT1)
@@ -1010,6 +1014,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("sha512", true)
.Case("shstk", true)
.Case("sm3", true)
.Case("sm4", true)
.Case("sse", true)
.Case("sse2", true)
.Case("sse3", true)
@@ -1117,6 +1122,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("sha512", HasSHA512)
.Case("shstk", HasSHSTK)
.Case("sm3", HasSM3)
.Case("sm4", HasSM4)
.Case("sse", SSELevel >= SSE1)
.Case("sse2", SSELevel >= SSE2)
.Case("sse3", SSELevel >= SSE3)
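
With these target changes, user code can detect the feature at compile time through the new __SM4__ macro, and "sm4" becomes a valid feature name for per-function target attributes. A minimal sketch, assuming a clang with this patch:

#include <immintrin.h>

// __SM4__ is predefined whenever the feature is enabled (e.g. via -msm4).
#ifdef __SM4__
static __m128i expand_keys(__m128i ck_constants, __m128i key_words) {
  return _mm_sm4key4_epi32(ck_constants, key_words);
}
#endif

// Per-function enabling; "sm4" is accepted because it was added to
// isValidFeatureName above. Runtime CPU detection remains the caller's job.
__attribute__((target("sm4"))) static __m128i
expand_keys_tgt(__m128i ck_constants, __m128i key_words) {
  return _mm_sm4key4_epi32(ck_constants, key_words);
}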
1 change: 1 addition & 0 deletions clang/lib/Basic/Targets/X86.h
@@ -116,6 +116,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasSHSTK = false;
bool HasSM3 = false;
bool HasSGX = false;
bool HasSM4 = false;
bool HasCX8 = false;
bool HasCX16 = false;
bool HasFXSR = false;
1 change: 1 addition & 0 deletions clang/lib/Headers/CMakeLists.txt
@@ -206,6 +206,7 @@ set(x86_files
sha512intrin.h
shaintrin.h
sm3intrin.h
sm4intrin.h
smmintrin.h
tbmintrin.h
tmmintrin.h
5 changes: 5 additions & 0 deletions clang/lib/Headers/immintrin.h
@@ -279,6 +279,11 @@
#include <sm3intrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__SM4__)
#include <sm4intrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__RDPID__)
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
269 changes: 269 additions & 0 deletions clang/lib/Headers/sm4intrin.h
@@ -0,0 +1,269 @@
/*===--------------- sm4intrin.h - SM4 intrinsics -----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/

#ifndef __IMMINTRIN_H
#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __SM4INTRIN_H
#define __SM4INTRIN_H

/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x int].
/// \param __B
/// A 128-bit vector of [4 x int].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
/// RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
/// RETURN tmp
/// }
/// DEFINE L_KEY(dword) {
/// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
/// }
/// DEFINE T_KEY(dword) {
/// RETURN L_KEY(lower_t(dword))
/// }
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
/// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 0
/// P[0] := __B.xmm[i].dword[0]
/// P[1] := __B.xmm[i].dword[1]
/// P[2] := __B.xmm[i].dword[2]
/// P[3] := __B.xmm[i].dword[3]
/// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// DEST.xmm[i].dword[0] := C[0]
/// DEST.xmm[i].dword[1] := C[1]
/// DEST.xmm[i].dword[2] := C[2]
/// DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
#define _mm_sm4key4_epi32(A, B) \
(__m128i) __builtin_ia32_vsm4key4128((__v4su)A, (__v4su)B)

/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
///
/// \param __A
/// A 256-bit vector of [8 x int].
/// \param __B
/// A 256-bit vector of [8 x int].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
/// RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
/// RETURN tmp
/// }
/// DEFINE L_KEY(dword) {
/// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
/// }
/// DEFINE T_KEY(dword) {
/// RETURN L_KEY(lower_t(dword))
/// }
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
/// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 1
/// P[0] := __B.xmm[i].dword[0]
/// P[1] := __B.xmm[i].dword[1]
/// P[2] := __B.xmm[i].dword[2]
/// P[3] := __B.xmm[i].dword[3]
/// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// DEST.xmm[i].dword[0] := C[0]
/// DEST.xmm[i].dword[1] := C[1]
/// DEST.xmm[i].dword[2] := C[2]
/// DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
#define _mm256_sm4key4_epi32(A, B) \
(__m256i) __builtin_ia32_vsm4key4256((__v8su)A, (__v8su)B)

/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x int].
/// \param __B
/// A 128-bit vector of [4 x int].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// DEFINE lower_t(dword) {
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
/// RETURN tmp
/// }
/// DEFINE L_RND(dword) {
/// tmp := dword
/// tmp := tmp ^ ROL32(dword, 2)
/// tmp := tmp ^ ROL32(dword, 10)
/// tmp := tmp ^ ROL32(dword, 18)
/// tmp := tmp ^ ROL32(dword, 24)
/// RETURN tmp
/// }
/// DEFINE T_RND(dword) {
/// RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
/// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 0
/// P[0] := __B.xmm[i].dword[0]
/// P[1] := __B.xmm[i].dword[1]
/// P[2] := __B.xmm[i].dword[2]
/// P[3] := __B.xmm[i].dword[3]
/// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// DEST.xmm[i].dword[0] := C[0]
/// DEST.xmm[i].dword[1] := C[1]
/// DEST.xmm[i].dword[2] := C[2]
/// DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
#define _mm_sm4rnds4_epi32(A, B) \
(__m128i) __builtin_ia32_vsm4rnds4128((__v4su)A, (__v4su)B)

/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
/// A 256-bit vector of [8 x int].
/// \param __B
/// A 256-bit vector of [8 x int].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// DEFINE lower_t(dword) {
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
/// RETURN tmp
/// }
/// DEFINE L_RND(dword) {
/// tmp := dword
/// tmp := tmp ^ ROL32(dword, 2)
/// tmp := tmp ^ ROL32(dword, 10)
/// tmp := tmp ^ ROL32(dword, 18)
/// tmp := tmp ^ ROL32(dword, 24)
/// RETURN tmp
/// }
/// DEFINE T_RND(dword) {
/// RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
/// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 1
/// P[0] := __B.xmm[i].dword[0]
/// P[1] := __B.xmm[i].dword[1]
/// P[2] := __B.xmm[i].dword[2]
/// P[3] := __B.xmm[i].dword[3]
/// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// DEST.xmm[i].dword[0] := C[0]
/// DEST.xmm[i].dword[1] := C[1]
/// DEST.xmm[i].dword[2] := C[2]
/// DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
#define _mm256_sm4rnds4_epi32(A, B) \
(__m256i) __builtin_ia32_vsm4rnds4256((__v8su)A, (__v8su)B)

#endif // __SM4INTRIN_H
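
Since both 256-bit forms operate on independent 128-bit lanes, two separate blocks (or key schedules) can be processed per call. A hedged sketch, assuming -msm4 together with the AVX support these vector types require:

#include <immintrin.h>

// Pack two independent 128-bit states into one __m256i (low lane = s0).
static __m256i pack_two_states(__m128i s0, __m128i s1) {
  return _mm256_set_m128i(s1, s0);
}

// Run four encryption rounds on both lanes at once; each lane follows the
// per-lane pseudocode documented above.
static __m256i sm4_encrypt4_x2(__m256i round_keys_x2, __m256i states_x2) {
  return _mm256_sm4rnds4_epi32(round_keys_x2, states_x2);
}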
28 changes: 28 additions & 0 deletions clang/test/CodeGen/X86/sm4-builtins.c
@@ -0,0 +1,28 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +sm4 -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +sm4 -emit-llvm -o - -Wall -Werror | FileCheck %s

#include <immintrin.h>

__m128i test_mm_sm4key4_epi32(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_sm4key4_epi32(
// CHECK: call <4 x i32> @llvm.x86.vsm4key4128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_sm4key4_epi32(__A, __B);
}

__m256i test_mm256_sm4key4_epi32(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_sm4key4_epi32(
// CHECK: call <8 x i32> @llvm.x86.vsm4key4256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_sm4key4_epi32(__A, __B);
}

__m128i test_mm_sm4rnds4_epi32(__m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_sm4rnds4_epi32(
// CHECK: call <4 x i32> @llvm.x86.vsm4rnds4128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_sm4rnds4_epi32(__A, __B);
}

__m256i test_mm256_sm4rnds4_epi32(__m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_sm4rnds4_epi32(
// CHECK: call <8 x i32> @llvm.x86.vsm4rnds4256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_sm4rnds4_epi32(__A, __B);
}
