diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index a0926f230d46f9..f21c17ee0ebe91 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1876,6 +1876,47 @@ TARGET_BUILTIN(__builtin_ia32_cmpsh_mask, "UcV8xV8xIiUcIi", "ncV:128:", "avx512f TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8x*V8xUc", "nV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rcpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rcpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rcpph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rsqrtph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rsqrtph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rsqrtph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_getmantph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_getmantph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_getmantph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_getexpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_getexpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_getexpph512_mask, "V32xV32xV32xUiIi", "ncV:512:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_scalefph128_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_scalefph256_mask, "V16xV16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_scalefph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_rndscaleph_128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rndscaleph_256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rndscaleph_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduceph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduceph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduceph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rcpsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rsqrtsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_getmantsh_round_mask, "V8xV8xV8xIiV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_getexpsh128_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_scalefsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rndscalesh_round_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reducesh_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_sqrtph, "V8xV8x", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_sqrtph256, "V16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_sqrtph512, "V32xV32xIi", "ncV:512:", "avx512fp16") 
+TARGET_BUILTIN(__builtin_ia32_sqrtsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_fpclassph128_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_fpclassph256_mask, "UsV16xIiUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_fpclassph512_mask, "UiV32xIiUi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_fpclasssh_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16") + TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph128_mask, "V8xV2dV8xUc", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_mask, "V8xV4dV8xUc", "ncV:256:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph512_mask, "V8xV8dV8xUcIi", "ncV:512:", "avx512fp16") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 89b773fc5f97b5..789c446940ce8a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13959,15 +13959,28 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, } return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0); } + case X86::BI__builtin_ia32_sqrtsh_round_mask: case X86::BI__builtin_ia32_sqrtsd_round_mask: case X86::BI__builtin_ia32_sqrtss_round_mask: { unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue(); // Support only if the rounding mode is 4 (AKA CUR_DIRECTION), // otherwise keep the intrinsic. if (CC != 4) { - Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtsd_round_mask ? - Intrinsic::x86_avx512_mask_sqrt_sd : - Intrinsic::x86_avx512_mask_sqrt_ss; + Intrinsic::ID IID; + + switch (BuiltinID) { + default: + llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_sqrtsh_round_mask: + IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh; + break; + case X86::BI__builtin_ia32_sqrtsd_round_mask: + IID = Intrinsic::x86_avx512_mask_sqrt_sd; + break; + case X86::BI__builtin_ia32_sqrtss_round_mask: + IID = Intrinsic::x86_avx512_mask_sqrt_ss; + break; + } return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); @@ -13989,6 +14002,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_sqrtpd: case X86::BI__builtin_ia32_sqrtps256: case X86::BI__builtin_ia32_sqrtps: + case X86::BI__builtin_ia32_sqrtph256: + case X86::BI__builtin_ia32_sqrtph: + case X86::BI__builtin_ia32_sqrtph512: case X86::BI__builtin_ia32_sqrtps512: case X86::BI__builtin_ia32_sqrtpd512: { if (Ops.size() == 2) { @@ -13996,9 +14012,21 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // Support only if the rounding mode is 4 (AKA CUR_DIRECTION), // otherwise keep the intrinsic. if (CC != 4) { - Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtps512 ?
- Intrinsic::x86_avx512_sqrt_ps_512 : - Intrinsic::x86_avx512_sqrt_pd_512; + Intrinsic::ID IID; + + switch (BuiltinID) { + default: + llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_sqrtph512: + IID = Intrinsic::x86_avx512fp16_sqrt_ph_512; + break; + case X86::BI__builtin_ia32_sqrtps512: + IID = Intrinsic::x86_avx512_sqrt_ps_512; + break; + case X86::BI__builtin_ia32_sqrtpd512: + IID = Intrinsic::x86_avx512_sqrt_pd_512; + break; + } return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } } @@ -14315,6 +14343,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_fpclassps128_mask: case X86::BI__builtin_ia32_fpclassps256_mask: case X86::BI__builtin_ia32_fpclassps512_mask: + case X86::BI__builtin_ia32_fpclassph128_mask: + case X86::BI__builtin_ia32_fpclassph256_mask: + case X86::BI__builtin_ia32_fpclassph512_mask: case X86::BI__builtin_ia32_fpclasspd128_mask: case X86::BI__builtin_ia32_fpclasspd256_mask: case X86::BI__builtin_ia32_fpclasspd512_mask: { @@ -14326,6 +14357,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Intrinsic::ID ID; switch (BuiltinID) { default: llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_fpclassph128_mask: + ID = Intrinsic::x86_avx512fp16_fpclass_ph_128; + break; + case X86::BI__builtin_ia32_fpclassph256_mask: + ID = Intrinsic::x86_avx512fp16_fpclass_ph_256; + break; + case X86::BI__builtin_ia32_fpclassph512_mask: + ID = Intrinsic::x86_avx512fp16_fpclass_ph_512; + break; case X86::BI__builtin_ia32_fpclassps128_mask: ID = Intrinsic::x86_avx512_fpclass_ps_128; break; diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 6a4a9d4a6c7eb8..48370d0bf0ee06 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -947,6 +947,492 @@ static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) { return __b[0]; } +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) { + return (__m512h)__builtin_ia32_rcpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W, + (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rcpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) { + return (__m512h)__builtin_ia32_rsqrtph512_mask( + (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W, + (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rsqrtph512_mask( + (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); +} + +#define _mm512_getmant_ph(A, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ph(W, U, A, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + 
(__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_ph(U, A, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getmant_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) + +#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) { + return (__m512h)__builtin_ia32_getexpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_getexpph512_mask( + (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_getexpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_getexp_round_ph(A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_getexp_round_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_getexp_round_ph(U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_scalefph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B, + (__v32hf)__W, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_scalefph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_scalef_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_scalefph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) + +#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_scalefph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_scalef_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_scalefph512_mask( \ + 
(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +#define _mm512_roundscale_ph(A, B) \ + ((__m512h)__builtin_ia32_rndscaleph_mask( \ + (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_ph(A, B, C, imm) \ + ((__m512h)__builtin_ia32_rndscaleph_mask( \ + (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \ + (__mmask32)(B), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_roundscale_ph(A, B, imm) \ + ((__m512h)__builtin_ia32_rndscaleph_mask( \ + (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \ + ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \ + (__v32hf)(__m512h)(A), \ + (__mmask32)(B), (int)(R))) + +#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \ + ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \ + (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(A), (int)(R))) + +#define _mm512_roundscale_round_ph(A, imm, R) \ + ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_reduce_ph(A, imm) \ + ((__m512h)__builtin_ia32_reduceph512_mask( \ + (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_ph(W, U, A, imm) \ + ((__m512h)__builtin_ia32_reduceph512_mask( \ + (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_reduce_ph(U, A, imm) \ + ((__m512h)__builtin_ia32_reduceph512_mask( \ + (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \ + ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)(__m512h)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \ + ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_reduce_round_ph(A, imm, R) \ + ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rcpsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rcpsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rsqrtsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return 
(__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_rsqrtsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +#define _mm_getmant_round_sh(A, B, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R))) + +#define _mm_getmant_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sh(W, U, A, B, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_getmant_sh(U, A, B, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm_getexp_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_getexpsh128_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_getexpsh128_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_getexp_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_getexpsh128_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_getexp_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_scalef_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_scalefsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_scalefsh_round_mask( + (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), 
(__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_scalef_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_scalefsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_scalef_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_scalefsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_roundscale_round_sh(A, B, imm, R) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(imm), (int)(R))) + +#define _mm_roundscale_sh(A, B, imm) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_sh(W, U, A, B, I) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(I), (int)(R))) + +#define _mm_maskz_roundscale_sh(U, A, B, I) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(I), (int)(R))) + +#define _mm_reduce_sh(A, B, C) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_reduce_sh(W, U, A, B, C) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_reduce_sh(U, A, B, C) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_reduce_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(C), (int)(R))) + +#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(C), (int)(R))) + +#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + 
(__mmask8)(U), (int)(C), (int)(R))) + +#define _mm512_sqrt_round_ph(A, R) \ + ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R))) + +#define _mm512_mask_sqrt_round_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_sqrt_round_ph(U, A, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) { + return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)(__U), + (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), + (__v32hf)(__m512h)(__W)); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)(__U), + (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm_sqrt_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_sqrt_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_sqrtsh_round_mask( + (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), + (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_sqrtsh_round_mask( + (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W), + (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_sqrtsh_round_mask( + (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), + (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fpclass_ph_mask(U, A, imm) \ + ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ + (int)(imm), (__mmask32)(U))) + +#define _mm512_fpclass_ph_mask(A, imm) \ + ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ + (int)(imm), (__mmask32)-1)) + +#define _mm_fpclass_sh_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_fpclass_sh_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__mmask8)(U))) + #define _mm512_cvt_roundpd_ph(A, R) \ ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index
ab2cf436ee16d9..1809211fd4066b 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -327,6 +327,284 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { ((__mmask8)__builtin_ia32_cmpph128_mask( \ (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) { + return (__m256h)__builtin_ia32_rcpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W, + (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rcpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) { + return (__m128h)__builtin_ia32_rcpph128_mask( + (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_rcpph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) { + return (__m256h)__builtin_ia32_rsqrtph256_mask( + (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W, + (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rsqrtph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) { + return (__m128h)__builtin_ia32_rsqrtph128_mask( + (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_rsqrtph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) { + return (__m128h)__builtin_ia32_getexpph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_getexpph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) { + return (__m256h)__builtin_ia32_getexpph256_mask( + 
(__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W, + (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_getexpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +#define _mm_getmant_ph(A, B, C) \ + ((__m128h)__builtin_ia32_getmantph128_mask( \ + (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_ph(W, U, A, B, C) \ + ((__m128h)__builtin_ia32_getmantph128_mask( \ + (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_ph(U, A, B, C) \ + ((__m128h)__builtin_ia32_getmantph128_mask( \ + (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U))) + +#define _mm256_getmant_ph(A, B, C) \ + ((__m256h)__builtin_ia32_getmantph256_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)-1)) + +#define _mm256_mask_getmant_ph(W, U, A, B, C) \ + ((__m256h)__builtin_ia32_getmantph256_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_getmant_ph(U, A, B, C) \ + ((__m256h)__builtin_ia32_getmantph256_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)(U))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_scalefph128_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefph128_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_scalefph256_mask( + (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B, + (__v16hf)__W, (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_scalefph256_mask( + (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +#define _mm_roundscale_ph(A, imm) \ + ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1)) + +#define _mm_mask_roundscale_ph(W, U, A, imm) \ + ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) + +#define _mm_maskz_roundscale_ph(U, A, imm) \ + ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ 
+ (__mmask8)(U))) + +#define _mm256_roundscale_ph(A, imm) \ + ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)-1)) + +#define _mm256_mask_roundscale_ph(W, U, A, imm) \ + ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_roundscale_ph(U, A, imm) \ + ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)(U))) + +#define _mm_reduce_ph(A, imm) \ + ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1)) + +#define _mm_mask_reduce_ph(W, U, A, imm) \ + ((__m128h)__builtin_ia32_reduceph128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_ph(U, A, imm) \ + ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U))) + +#define _mm256_reduce_ph(A, imm) \ + ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ + (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)-1)) + +#define _mm256_mask_reduce_ph(W, U, A, imm) \ + ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ + (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_reduce_ph(U, A, imm) \ + ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ + (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)(U))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) { + return __builtin_ia32_sqrtph((__v8hf)__a); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { + return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, + (__v16hf)_mm256_sqrt_ph(__A), + (__v16hf)_mm256_setzero_ph()); +} + +#define _mm_mask_fpclass_ph_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ + (int)(imm), (__mmask8)(U))) + +#define _mm_fpclass_ph_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ + (int)(imm), (__mmask8)-1)) + +#define _mm256_mask_fpclass_ph_mask(U, A, imm) \ + ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ + (int)(imm), (__mmask16)(U))) + +#define _mm256_fpclass_ph_mask(A, imm) \ + ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ + (int)(imm), (__mmask16)-1)) + static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) { return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); diff --git 
a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 69560027f330a7..4e7d5b66bca7f9 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3913,6 +3913,7 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_exp2ps_mask: case X86::BI__builtin_ia32_getexppd512_mask: case X86::BI__builtin_ia32_getexpps512_mask: + case X86::BI__builtin_ia32_getexpph512_mask: case X86::BI__builtin_ia32_rcp28pd_mask: case X86::BI__builtin_ia32_rcp28ps_mask: case X86::BI__builtin_ia32_rsqrt28pd_mask: @@ -3933,8 +3934,10 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_cvtss2sd_round_mask: case X86::BI__builtin_ia32_getexpsd128_round_mask: case X86::BI__builtin_ia32_getexpss128_round_mask: + case X86::BI__builtin_ia32_getexpsh128_round_mask: case X86::BI__builtin_ia32_getmantpd512_mask: case X86::BI__builtin_ia32_getmantps512_mask: + case X86::BI__builtin_ia32_getmantph512_mask: case X86::BI__builtin_ia32_maxsd_round_mask: case X86::BI__builtin_ia32_maxss_round_mask: case X86::BI__builtin_ia32_maxsh_round_mask: @@ -3945,8 +3948,10 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_rcp28ss_round_mask: case X86::BI__builtin_ia32_reducepd512_mask: case X86::BI__builtin_ia32_reduceps512_mask: + case X86::BI__builtin_ia32_reduceph512_mask: case X86::BI__builtin_ia32_rndscalepd_mask: case X86::BI__builtin_ia32_rndscaleps_mask: + case X86::BI__builtin_ia32_rndscaleph_mask: case X86::BI__builtin_ia32_rsqrt28sd_round_mask: case X86::BI__builtin_ia32_rsqrt28ss_round_mask: ArgNum = 4; @@ -3961,14 +3966,17 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_fixupimmss_maskz: case X86::BI__builtin_ia32_getmantsd_round_mask: case X86::BI__builtin_ia32_getmantss_round_mask: + case X86::BI__builtin_ia32_getmantsh_round_mask: case X86::BI__builtin_ia32_rangepd512_mask: case X86::BI__builtin_ia32_rangeps512_mask: case X86::BI__builtin_ia32_rangesd128_round_mask: case X86::BI__builtin_ia32_rangess128_round_mask: case X86::BI__builtin_ia32_reducesd_mask: case X86::BI__builtin_ia32_reducess_mask: + case X86::BI__builtin_ia32_reducesh_mask: case X86::BI__builtin_ia32_rndscalesd_round_mask: case X86::BI__builtin_ia32_rndscaless_round_mask: + case X86::BI__builtin_ia32_rndscalesh_round_mask: ArgNum = 5; break; case X86::BI__builtin_ia32_vcvtsd2si64: @@ -3985,6 +3993,7 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_vcvtsh2usi64: case X86::BI__builtin_ia32_sqrtpd512: case X86::BI__builtin_ia32_sqrtps512: + case X86::BI__builtin_ia32_sqrtph512: ArgNum = 1; HasRC = true; break; @@ -4057,15 +4066,18 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_subsh_round_mask: case X86::BI__builtin_ia32_subss_round_mask: case X86::BI__builtin_ia32_subsd_round_mask: + case X86::BI__builtin_ia32_scalefph512_mask: case X86::BI__builtin_ia32_scalefpd512_mask: case X86::BI__builtin_ia32_scalefps512_mask: case X86::BI__builtin_ia32_scalefsd_round_mask: case X86::BI__builtin_ia32_scalefss_round_mask: + case X86::BI__builtin_ia32_scalefsh_round_mask: case X86::BI__builtin_ia32_cvtsd2ss_round_mask: case X86::BI__builtin_ia32_vcvtss2sh_round_mask: case X86::BI__builtin_ia32_vcvtsd2sh_round_mask: case X86::BI__builtin_ia32_sqrtsd_round_mask: case 
X86::BI__builtin_ia32_sqrtss_round_mask: + case X86::BI__builtin_ia32_sqrtsh_round_mask: case X86::BI__builtin_ia32_vfmaddsd3_mask: case X86::BI__builtin_ia32_vfmaddsd3_maskz: case X86::BI__builtin_ia32_vfmaddsd3_mask3: @@ -4439,6 +4451,9 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_getmantps256_mask: case X86::BI__builtin_ia32_getmantpd512_mask: case X86::BI__builtin_ia32_getmantps512_mask: + case X86::BI__builtin_ia32_getmantph128_mask: + case X86::BI__builtin_ia32_getmantph256_mask: + case X86::BI__builtin_ia32_getmantph512_mask: case X86::BI__builtin_ia32_vec_ext_v16qi: case X86::BI__builtin_ia32_vec_ext_v16hi: i = 1; l = 0; u = 15; @@ -4457,6 +4472,7 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_rangeps512_mask: case X86::BI__builtin_ia32_getmantsd_round_mask: case X86::BI__builtin_ia32_getmantss_round_mask: + case X86::BI__builtin_ia32_getmantsh_round_mask: case X86::BI__builtin_ia32_vec_set_v16qi: case X86::BI__builtin_ia32_vec_set_v16hi: i = 2; l = 0; u = 15; @@ -4509,12 +4525,16 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_rndscalepd_256_mask: case X86::BI__builtin_ia32_rndscaleps_mask: case X86::BI__builtin_ia32_rndscalepd_mask: + case X86::BI__builtin_ia32_rndscaleph_mask: case X86::BI__builtin_ia32_reducepd128_mask: case X86::BI__builtin_ia32_reducepd256_mask: case X86::BI__builtin_ia32_reducepd512_mask: case X86::BI__builtin_ia32_reduceps128_mask: case X86::BI__builtin_ia32_reduceps256_mask: case X86::BI__builtin_ia32_reduceps512_mask: + case X86::BI__builtin_ia32_reduceph128_mask: + case X86::BI__builtin_ia32_reduceph256_mask: + case X86::BI__builtin_ia32_reduceph512_mask: case X86::BI__builtin_ia32_prold512: case X86::BI__builtin_ia32_prolq512: case X86::BI__builtin_ia32_prold128: @@ -4533,8 +4553,12 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_fpclassps256_mask: case X86::BI__builtin_ia32_fpclassps512_mask: case X86::BI__builtin_ia32_fpclasspd512_mask: + case X86::BI__builtin_ia32_fpclassph128_mask: + case X86::BI__builtin_ia32_fpclassph256_mask: + case X86::BI__builtin_ia32_fpclassph512_mask: case X86::BI__builtin_ia32_fpclasssd_mask: case X86::BI__builtin_ia32_fpclassss_mask: + case X86::BI__builtin_ia32_fpclasssh_mask: case X86::BI__builtin_ia32_pslldqi128_byteshift: case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: @@ -4645,6 +4669,8 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_reducess_mask: case X86::BI__builtin_ia32_rndscalesd_round_mask: case X86::BI__builtin_ia32_rndscaless_round_mask: + case X86::BI__builtin_ia32_rndscalesh_round_mask: + case X86::BI__builtin_ia32_reducesh_mask: i = 4; l = 0; u = 255; break; } diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c index d4fe44bc259eeb..42591662606ebd 100644 --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -1542,6 +1542,537 @@ __m128i test_mm_cvtsi16_si128(short A) { return _mm_cvtsi16_si128(A); } +__m512h test_mm512_rcp_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_rcp_ph + // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.512 + return _mm512_rcp_ph(__A); +} + +__m512h test_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) { + // 
CHECK-LABEL: @test_mm512_mask_rcp_ph + // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.512 + return (__m512h)_mm512_mask_rcp_ph(__W, __U, __A); +} + +__m512h test_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_rcp_ph + // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.512 + return _mm512_maskz_rcp_ph(__U, __A); +} + +__m512h test_mm512_rsqrt_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_rsqrt_ph + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.512 + return _mm512_rsqrt_ph(__A); +} + +__m512h test_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_rsqrt_ph + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.512 + return (__m512h)_mm512_mask_rsqrt_ph(__W, __U, __A); +} + +__m512h test_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_rsqrt_ph + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.512 + return _mm512_maskz_rsqrt_ph(__U, __A); +} + +__m512h test_mm512_getmant_round_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_getmant_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512 + return _mm512_getmant_round_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_getmant_round_ph(__m512h __W, __mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_getmant_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512 + return _mm512_mask_getmant_round_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_getmant_round_ph(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_getmant_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512 + return _mm512_maskz_getmant_round_ph(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_getmant_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_getmant_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512 + return _mm512_getmant_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m512h test_mm512_mask_getmant_ph(__m512h __W, __mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_getmant_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512 + return _mm512_mask_getmant_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m512h test_mm512_maskz_getmant_ph(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_getmant_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512 + return _mm512_maskz_getmant_ph(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m512h test_mm512_scalef_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_scalef_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512 + return _mm512_scalef_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_scalef_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_scalef_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512 + return _mm512_mask_scalef_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_scalef_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_scalef_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512 + return _mm512_maskz_scalef_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_scalef_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_scalef_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512 + return 
_mm512_scalef_ph(__A, __B); +} + +__m512h test_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_scalef_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512 + return _mm512_mask_scalef_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_scalef_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512 + return _mm512_maskz_scalef_ph(__U, __A, __B); +} + +__m512h test_mm512_mask_roundscale_ph(__m512h __W, __mmask16 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_roundscale_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512 + return _mm512_mask_roundscale_ph(__W, __U, __A, 1); +} + +__m512h test_mm512_maskz_roundscale_ph(__mmask16 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_roundscale_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512 + return _mm512_maskz_roundscale_ph(__U, __A, 1); +} + +__m512h test_mm512_mask_roundscale_round_ph(__m512h __A, __mmask16 __U, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_roundscale_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512 + return _mm512_mask_roundscale_round_ph(__A, __U, __C, 3, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_roundscale_round_ph(__m512h __A, __mmask16 __U) { + // CHECK-LABEL: @test_mm512_maskz_roundscale_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512 + return _mm512_maskz_roundscale_round_ph(__U, __A, 3, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_roundscale_round_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_roundscale_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512 + return _mm512_roundscale_round_ph(__A, 3, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_roundscale_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_roundscale_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512 + return _mm512_roundscale_ph(__A, 3); +} + +__m512h test_mm512_getexp_round_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_getexp_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512 + return _mm512_getexp_round_ph(__A, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_getexp_round_ph(__m512h __W, __mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_getexp_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512 + return _mm512_mask_getexp_round_ph(__W, __U, __A, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_getexp_round_ph(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_getexp_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512 + return _mm512_maskz_getexp_round_ph(__U, __A, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_getexp_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_getexp_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512 + return _mm512_getexp_ph(__A); +} + +__m512h test_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_getexp_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512 + return _mm512_mask_getexp_ph(__W, __U, __A); +} + +__m512h test_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_getexp_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512 + return _mm512_maskz_getexp_ph(__U, __A); +} + +__m512h test_mm512_mask_reduce_ph(__m512h __W, __mmask16 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_reduce_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512 + return _mm512_mask_reduce_ph(__W, __U, __A, 1); +} + +__m512h 
test_mm512_maskz_reduce_ph(__mmask16 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_reduce_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512 + return _mm512_maskz_reduce_ph(__U, __A, 1); +} + +__m512h test_mm512_mask_reduce_round_ph(__m512h __A, __mmask16 __U, __m512h __C) { + // CHECK-LABEL: @test_mm512_mask_reduce_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512 + return _mm512_mask_reduce_round_ph(__A, __U, __C, 3, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_reduce_round_ph(__m512h __A, __mmask16 __U) { + // CHECK-LABEL: @test_mm512_maskz_reduce_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512 + return _mm512_maskz_reduce_round_ph(__U, __A, 3, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_reduce_round_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_reduce_round_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512 + return _mm512_reduce_round_ph(__A, 3, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_reduce_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_reduce_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512 + return _mm512_reduce_ph(__A, 3); +} +__m128h test_mm_rcp_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_rcp_sh + // CHECK: @llvm.x86.avx512fp16.mask.rcp.sh + return _mm_rcp_sh(__A, __B); +} + +__m128h test_mm_mask_rcp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_rcp_sh + // CHECK: @llvm.x86.avx512fp16.mask.rcp.sh + return _mm_mask_rcp_sh(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_rcp_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_rcp_sh + // CHECK: @llvm.x86.avx512fp16.mask.rcp.sh + return _mm_maskz_rcp_sh(__U, __A, __B); +} + +__m128h test_mm_rsqrt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_rsqrt_sh + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.sh + return _mm_rsqrt_sh(__A, __B); +} + +__m128h test_mm_mask_rsqrt_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_rsqrt_sh + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.sh + return _mm_mask_rsqrt_sh(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_rsqrt_sh + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.sh + return _mm_maskz_rsqrt_sh(__U, __A, __B); +} + +__m128h test_mm_getmant_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_getmant_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh + return _mm_getmant_round_sh(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8); +} + +__m128h test_mm_getmant_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_getmant_sh + // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh + return _mm_getmant_sh(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src); +} + +__m128h test_mm_mask_getmant_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_getmant_sh + // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh + return _mm_mask_getmant_sh(__W, __U, __A, __B, 1, 2); +} + +__m128h test_mm_mask_getmant_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_getmant_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh + return _mm_mask_getmant_round_sh(__W, __U, __A, __B, 1, 2, _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_getmant_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_getmant_sh + // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh + return _mm_maskz_getmant_sh(__U, __A, __B, 1, 2); +} + +__m128h 
test_mm_maskz_getmant_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_getmant_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh + return _mm_maskz_getmant_round_sh(__U, __A, __B, 1, 2, _MM_FROUND_NO_EXC); +} + +__m128h test_mm_getexp_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_getexp_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh + return _mm_getexp_round_sh(__A, __B, 8); +} + +__m128h test_mm_getexp_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_getexp_sh + // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh + return _mm_getexp_sh(__A, __B); +} + +__m128h test_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_getexp_sh + // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh + return _mm_mask_getexp_sh(__W, __U, __A, __B); +} + +__m128h test_mm_mask_getexp_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_getexp_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh + return _mm_mask_getexp_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_getexp_sh + // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh + return _mm_maskz_getexp_sh(__U, __A, __B); +} + +__m128h test_mm_maskz_getexp_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_getexp_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh + return _mm_maskz_getexp_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC); +} + +__m128h test_mm_scalef_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_scalef_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 -1, i32 11) + return _mm_scalef_round_sh(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_scalef_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_scalef_sh + // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh + return _mm_scalef_sh(__A, __B); +} + +__m128h test_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_scalef_sh + // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh + return _mm_mask_scalef_sh(__W, __U, __A, __B); +} + +__m128h test_mm_mask_scalef_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_scalef_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 %{{.*}}, i32 11) + return _mm_mask_scalef_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_scalef_sh + // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh + return _mm_maskz_scalef_sh(__U, __A, __B); +} + +__m128h test_mm_maskz_scalef_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_scalef_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 %{{.*}}, i32 11) + return _mm_maskz_scalef_round_sh(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_roundscale_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_roundscale_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh + return _mm_roundscale_round_sh(__A, __B, 3, _MM_FROUND_NO_EXC); +} + +__m128h test_mm_roundscale_sh(__m128h __A, __m128h __B) { 
+ // CHECK-LABEL: @test_mm_roundscale_sh + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh + return _mm_roundscale_sh(__A, __B, 3); +} + +__m128h test_mm_mask_roundscale_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_roundscale_sh + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh + return _mm_mask_roundscale_sh(__W, __U, __A, __B, 3); +} + +__m128h test_mm_mask_roundscale_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_roundscale_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh + return _mm_mask_roundscale_round_sh(__W, __U, __A, __B, 3, _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_roundscale_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_roundscale_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh + return _mm_maskz_roundscale_round_sh(__U, __A, __B, 3, _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_roundscale_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_roundscale_sh + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh + return _mm_maskz_roundscale_sh(__U, __A, __B, 3); +} + +__m128h test_mm_reduce_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_reduce_sh + // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh + return _mm_reduce_sh(__A, __B, 4); +} + +__m128h test_mm_mask_reduce_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_reduce_sh + // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh + return _mm_mask_reduce_sh(__W, __U, __A, __B, 4); +} + +__m128h test_mm_maskz_reduce_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_reduce_sh + // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh + return _mm_maskz_reduce_sh(__U, __A, __B, 4); +} + +__m128h test_mm_reduce_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_reduce_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh + return _mm_reduce_round_sh(__A, __B, 4, 8); +} + +__m128h test_mm_mask_reduce_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_reduce_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh + return _mm_mask_reduce_round_sh(__W, __U, __A, __B, 4, 8); +} + +__m128h test_mm_maskz_reduce_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_reduce_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh + return _mm_maskz_reduce_round_sh(__U, __A, __B, 4, 8); +} + +__m512h test_mm512_sqrt_round_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_sqrt_round_ph + // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11) + return _mm512_sqrt_round_ph(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_sqrt_round_ph(__m512h __W, __mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_sqrt_round_ph + // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_sqrt_round_ph(__W, __U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_sqrt_round_ph(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ph + // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}} + 
return _mm512_maskz_sqrt_round_ph(__U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_sqrt_ph(__m512h __A) { + // CHECK-LABEL: @test_mm512_sqrt_ph + // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}}) + return _mm512_sqrt_ph(__A); +} +__m512h test_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_sqrt_ph + // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_sqrt_ph(__W, __U, __A); +} +__m512h test_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_maskz_sqrt_ph + // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}}) + // CHECK: bitcast i32 %{{.*}} to <32 x i1> + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}} + return _mm512_maskz_sqrt_ph(__U, __A); +} + +__m128h test_mm_sqrt_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_sqrt_round_sh + // CHECK: call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 -1, i32 11) + return _mm_sqrt_round_sh(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_mask_sqrt_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_sqrt_round_sh + // CHECK: call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 {{.*}}, i32 11) + return _mm_mask_sqrt_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_maskz_sqrt_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_sqrt_round_sh + // CHECK: call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 {{.*}}, i32 11) + return _mm_maskz_sqrt_round_sh(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m128h test_mm_sqrt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_sqrt_sh + // CHECK: %{{.*}} = call half @llvm.sqrt.f16(half %{{.*}}) + return _mm_sqrt_sh(__A, __B); +} +__m128h test_mm_mask_sqrt_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_sqrt_sh + // CHECK: %{{.*}} = call half @llvm.sqrt.f16(half %{{.*}}) + return _mm_mask_sqrt_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_sqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_sqrt_sh + // CHECK: %{{.*}} = call half @llvm.sqrt.f16(half %{{.*}}) + return _mm_maskz_sqrt_sh(__U, __A, __B); +} + +__mmask32 test_mm512_mask_fpclass_ph_mask(__mmask32 __U, __m512h __A) { + // CHECK-LABEL: @test_mm512_mask_fpclass_ph_mask + // CHECK: @llvm.x86.avx512fp16.fpclass.ph.512 + return _mm512_mask_fpclass_ph_mask(__U, __A, 4); +} + +__mmask32 test_mm512_fpclass_ph_mask(__m512h __A) { + // CHECK-LABEL: @test_mm512_fpclass_ph_mask + // CHECK: @llvm.x86.avx512fp16.fpclass.ph.512 + return _mm512_fpclass_ph_mask(__A, 4); +} + +__mmask8 test_mm_fpclass_sh_mask(__m128h __A) { + // CHECK-LABEL: @test_mm_fpclass_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.fpclass.sh + return _mm_fpclass_sh_mask(__A, 2); +} + +__mmask8 test_mm_mask_fpclass_sh_mask(__mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_fpclass_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.fpclass.sh + return _mm_mask_fpclass_sh_mask(__U, __A, 2); +} + __m128h
test_mm512_cvt_roundpd_ph(__m512d A) { // CHECK-LABEL: test_mm512_cvt_roundpd_ph // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512 diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c index 0d020ccd1452f9..cb99d655f21c63 100644 --- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c @@ -1215,6 +1215,320 @@ __mmask8 test_mm_mask_cmp_ph_mask_true_us(__mmask8 m, __m128h a, __m128h b) { return _mm_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US); } +__m256h test_mm256_rcp_ph(__m256h __A) { + // CHECK-LABEL: @test_mm256_rcp_ph + // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.256 + return _mm256_rcp_ph(__A); +} + +__m256h test_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_mask_rcp_ph + // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.256 + return (__m256h)_mm256_mask_rcp_ph(__W, __U, __A); +} + +__m256h test_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_maskz_rcp_ph + // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.256 + return _mm256_maskz_rcp_ph(__U, __A); +} + +__m128h test_mm_rcp_ph(__m128h __A) { + // CHECK-LABEL: @test_mm_rcp_ph + // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.128 + return _mm_rcp_ph(__A); +} + +__m128h test_mm_mask_rcp_ph(__m128h __W, __mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_rcp_ph + // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.128 + return (__m128h)_mm_mask_rcp_ph(__W, __U, __A); +} + +__m128h test_mm_maskz_rcp_ph(__mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_maskz_rcp_ph + // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.128 + return _mm_maskz_rcp_ph(__U, __A); +} + +__m256h test_mm256_rsqrt_ph(__m256h __A) { + // CHECK-LABEL: @test_mm256_rsqrt_ph + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.256 + return _mm256_rsqrt_ph(__A); +} + +__m256h test_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_mask_rsqrt_ph + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.256 + return (__m256h)_mm256_mask_rsqrt_ph(__W, __U, __A); +} + +__m256h test_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_maskz_rsqrt_ph + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.256 + return _mm256_maskz_rsqrt_ph(__U, __A); +} + +__m128h test_mm_rsqrt_ph(__m128h __A) { + // CHECK-LABEL: @test_mm_rsqrt_ph + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.128 + return _mm_rsqrt_ph(__A); +} + +__m128h test_mm_mask_rsqrt_ph(__m128h __W, __mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_rsqrt_ph + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.128 + return (__m128h)_mm_mask_rsqrt_ph(__W, __U, __A); +} + +__m128h test_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_maskz_rsqrt_ph + // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.128 + return _mm_maskz_rsqrt_ph(__U, __A); +} + +__m128h test_mm_getmant_ph(__m128h __A) { + // CHECK-LABEL: @test_mm_getmant_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.128 + return _mm_getmant_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m128h test_mm_mask_getmant_ph(__m128h __W, __mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_getmant_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.128 + return _mm_mask_getmant_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m128h test_mm_maskz_getmant_ph(__mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_maskz_getmant_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.128 + return _mm_maskz_getmant_ph(__U, __A,
_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m256h test_mm256_getmant_ph(__m256h __A) { + // CHECK-LABEL: @test_mm256_getmant_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.256 + return _mm256_getmant_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m256h test_mm256_mask_getmant_ph(__m256h __W, __mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_mask_getmant_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.256 + return _mm256_mask_getmant_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m256h test_mm256_maskz_getmant_ph(__mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_maskz_getmant_ph + // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.256 + return _mm256_maskz_getmant_ph(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); +} + +__m128h test_mm_getexp_ph(__m128h __A) { + // CHECK-LABEL: @test_mm_getexp_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.128 + return _mm_getexp_ph(__A); +} + +__m128h test_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_getexp_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.128 + return _mm_mask_getexp_ph(__W, __U, __A); +} + +__m128h test_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_maskz_getexp_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.128 + return _mm_maskz_getexp_ph(__U, __A); +} + +__m256h test_mm256_getexp_ph(__m256h __A) { + // CHECK-LABEL: @test_mm256_getexp_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.256 + return _mm256_getexp_ph(__A); +} + +__m256h test_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_mask_getexp_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.256 + return _mm256_mask_getexp_ph(__W, __U, __A); +} + +__m256h test_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_maskz_getexp_ph + // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.256 + return _mm256_maskz_getexp_ph(__U, __A); +} + +__m128h test_mm_scalef_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_scalef_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.128 + return _mm_scalef_ph(__A, __B); +} + +__m128h test_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_scalef_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.128 + return _mm_mask_scalef_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_scalef_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.128 + return _mm_maskz_scalef_ph(__U, __A, __B); +} + +__m256h test_mm256_scalef_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_scalef_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.256 + return _mm256_scalef_ph(__A, __B); +} + +__m256h test_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_scalef_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.256 + return _mm256_mask_scalef_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_scalef_ph + // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.256 + return _mm256_maskz_scalef_ph(__U, __A, __B); +} + +__m128h test_mm_roundscale_ph(__m128h __A) { + // CHECK-LABEL: @test_mm_roundscale_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.128 + return _mm_roundscale_ph(__A, 4); +} + +__m128h test_mm_mask_roundscale_ph(__m128h __W, __mmask8 __U, __m128h __A) { + // 
CHECK-LABEL: @test_mm_mask_roundscale_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.128 + return _mm_mask_roundscale_ph(__W, __U, __A, 4); +} + +__m128h test_mm_maskz_roundscale_ph(__mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_maskz_roundscale_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.128 + return _mm_maskz_roundscale_ph(__U, __A, 4); +} + +__m256h test_mm256_roundscale_ph(__m256h __A) { + // CHECK-LABEL: @test_mm256_roundscale_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.256 + return _mm256_roundscale_ph(__A, 4); +} + +__m256h test_mm256_mask_roundscale_ph(__m256h __W, __mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_mask_roundscale_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.256 + return _mm256_mask_roundscale_ph(__W, __U, __A, 4); +} + +__m256h test_mm256_maskz_roundscale_ph(__mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_maskz_roundscale_ph + // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.256 + return _mm256_maskz_roundscale_ph(__U, __A, 4); +} + +__m128h test_mm_reduce_ph(__m128h __A) { + // CHECK-LABEL: @test_mm_reduce_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.128 + return _mm_reduce_ph(__A, 4); +} + +__m128h test_mm_mask_reduce_ph(__m128h __W, __mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_reduce_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.128 + return _mm_mask_reduce_ph(__W, __U, __A, 4); +} + +__m128h test_mm_maskz_reduce_ph(__mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_maskz_reduce_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.128 + return _mm_maskz_reduce_ph(__U, __A, 4); +} + +__m256h test_mm256_reduce_ph(__m256h __A) { + // CHECK-LABEL: @test_mm256_reduce_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.256 + return _mm256_reduce_ph(__A, 4); +} + +__m256h test_mm256_mask_reduce_ph(__m256h __W, __mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_mask_reduce_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.256 + return _mm256_mask_reduce_ph(__W, __U, __A, 4); +} + +__m256h test_mm256_maskz_reduce_ph(__mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_maskz_reduce_ph + // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.256 + return _mm256_maskz_reduce_ph(__U, __A, 4); +} +__m128h test_mm_sqrt_ph(__m128h x) { + // CHECK-LABEL: test_mm_sqrt_ph + // CHECK: call <8 x half> @llvm.sqrt.v8f16(<8 x half> {{.*}}) + return _mm_sqrt_ph(x); +} + +__m256h test_mm256_sqrt_ph(__m256h A) { + // CHECK-LABEL: test_mm256_sqrt_ph + // CHECK: call <16 x half> @llvm.sqrt.v16f16(<16 x half> %{{.*}}) + return _mm256_sqrt_ph(A); +} + +__m128h test_mm_mask_sqrt_ph(__m128h __W, __mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_sqrt_ph + // CHECK: @llvm.sqrt.v8f16 + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask_sqrt_ph(__W, __U, __A); +} + +__m128h test_mm_maskz_sqrt_ph(__mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_maskz_sqrt_ph + // CHECK: @llvm.sqrt.v8f16 + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_sqrt_ph(__U, __A); +} + +__m256h test_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_mask_sqrt_ph + // CHECK: @llvm.sqrt.v16f16 + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask_sqrt_ph(__W, __U, __A); +} + +__m256h test_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_maskz_sqrt_ph + // CHECK: @llvm.sqrt.v16f16 + // CHECK: select <16 x i1> 
%{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_sqrt_ph(__U, __A); +} +__mmask8 test_mm_mask_fpclass_ph_mask(__mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_fpclass_ph_mask + // CHECK: @llvm.x86.avx512fp16.fpclass.ph.128 + return _mm_mask_fpclass_ph_mask(__U, __A, 2); +} + +__mmask8 test_mm_fpclass_ph_mask(__m128h __A) { + // CHECK-LABEL: @test_mm_fpclass_ph_mask + // CHECK: @llvm.x86.avx512fp16.fpclass.ph.128 + return _mm_fpclass_ph_mask(__A, 2); +} + +__mmask16 test_mm256_mask_fpclass_ph_mask(__mmask16 __U, __m256h __A) { + // CHECK-LABEL: @test_mm256_mask_fpclass_ph_mask + // CHECK: @llvm.x86.avx512fp16.fpclass.ph.256 + return _mm256_mask_fpclass_ph_mask(__U, __A, 2); +} + +__mmask16 test_mm256_fpclass_ph_mask(__m256h __A) { + // CHECK-LABEL: @test_mm256_fpclass_ph_mask + // CHECK: @llvm.x86.avx512fp16.fpclass.ph.256 + return _mm256_fpclass_ph_mask(__A, 2); +} + __m128h test_mm_cvtpd_ph(__m128d A) { // CHECK-LABEL: test_mm_cvtpd_ph // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.128 diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 72e9c3404775dd..c79c6118db680e 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5541,4 +5541,172 @@ let TargetPrefix = "x86" in { : GCCBuiltin<"__builtin_ia32_vcvttsh2usi64">, Intrinsic<[ llvm_i64_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], [ IntrNoMem, ImmArg> ]>; + + def int_x86_avx512fp16_sqrt_ph_512 + : Intrinsic<[ llvm_v32f16_ty ], [ llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_sqrt_sh + : Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_rsqrt_ph_128 + : GCCBuiltin<"__builtin_ia32_rsqrtph128_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_rsqrt_ph_256 + : GCCBuiltin<"__builtin_ia32_rsqrtph256_mask">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_rsqrt_ph_512 + : GCCBuiltin<"__builtin_ia32_rsqrtph512_mask">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_rsqrt_sh + : GCCBuiltin<"__builtin_ia32_rsqrtsh_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_rcp_ph_128 + : GCCBuiltin<"__builtin_ia32_rcpph128_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_rcp_ph_256 + : GCCBuiltin<"__builtin_ia32_rcpph256_mask">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_rcp_ph_512 + : GCCBuiltin<"__builtin_ia32_rcpph512_mask">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_rcp_sh + : GCCBuiltin<"__builtin_ia32_rcpsh_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_reduce_ph_128 + : GCCBuiltin<"__builtin_ia32_reduceph128_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_reduce_ph_256 + : 
GCCBuiltin<"__builtin_ia32_reduceph256_mask">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_reduce_ph_512 + : GCCBuiltin<"__builtin_ia32_reduceph512_mask">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_mask_reduce_sh + : GCCBuiltin<"__builtin_ia32_reducesh_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_fpclass_ph_128 + : Intrinsic<[ llvm_v8i1_ty ], [ llvm_v8f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_fpclass_ph_256 + : Intrinsic<[ llvm_v16i1_ty ], [ llvm_v16f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_fpclass_ph_512 + : Intrinsic<[ llvm_v32i1_ty ], [ llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_fpclass_sh + : GCCBuiltin<"__builtin_ia32_fpclasssh_mask">, + Intrinsic<[ llvm_i8_ty ], [ llvm_v8f16_ty, llvm_i32_ty, llvm_i8_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_getexp_ph_128 + : GCCBuiltin<"__builtin_ia32_getexpph128_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_getexp_ph_256 + : GCCBuiltin<"__builtin_ia32_getexpph256_mask">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_getexp_ph_512 + : GCCBuiltin<"__builtin_ia32_getexpph512_mask">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_getexp_sh + : GCCBuiltin<"__builtin_ia32_getexpsh128_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_getmant_ph_128 + : GCCBuiltin<"__builtin_ia32_getmantph128_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_getmant_ph_256 + : GCCBuiltin<"__builtin_ia32_getmantph256_mask">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_getmant_ph_512 + : GCCBuiltin<"__builtin_ia32_getmantph512_mask">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_mask_getmant_sh + : GCCBuiltin<"__builtin_ia32_getmantsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, + llvm_i8_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_mask_rndscale_ph_128 + : GCCBuiltin<"__builtin_ia32_rndscaleph_128_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_i32_ty, llvm_v8f16_ty, llvm_i8_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_rndscale_ph_256 + : GCCBuiltin<"__builtin_ia32_rndscaleph_256_mask">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_i32_ty, llvm_v16f16_ty, llvm_i16_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_rndscale_ph_512 + : GCCBuiltin<"__builtin_ia32_rndscaleph_mask">, + 
Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_i32_ty, llvm_v32f16_ty, llvm_i32_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_mask_rndscale_sh + : GCCBuiltin<"__builtin_ia32_rndscalesh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_mask_scalef_ph_128 + : GCCBuiltin<"__builtin_ia32_scalefph128_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_scalef_ph_256 + : GCCBuiltin<"__builtin_ia32_scalefph256_mask">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_v16f16_ty, llvm_i16_ty ], + [ IntrNoMem ]>; + def int_x86_avx512fp16_mask_scalef_ph_512 + : GCCBuiltin<"__builtin_ia32_scalefph512_mask">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_scalef_sh + : GCCBuiltin<"__builtin_ia32_scalefsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3595bc57e4d882..d396d5c0df7e07 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1917,6 +1917,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL, VT, Legal); setOperationAction(ISD::FDIV, VT, Legal); setOperationAction(ISD::STRICT_FDIV, VT, Legal); + setOperationAction(ISD::FSQRT, VT, Legal); + setOperationAction(ISD::STRICT_FSQRT, VT, Legal); + + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::STRICT_FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::STRICT_FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::LOAD, VT, Legal); setOperationAction(ISD::STORE, VT, Legal); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 7a2b6ade1796ca..c92abc7e8c95d5 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2816,24 +2816,28 @@ multiclass avx512_vector_fpclass_all opcVec, - bits<8> opcScalar, X86SchedWriteWidths sched, - Predicate prd> { + bits<8> opcScalar, X86SchedWriteWidths sched> { + defm PH : avx512_vector_fpclass_all, + EVEX_CD8<16, CD8VF>, AVX512PSIi8Base, TA; + defm SHZ : avx512_scalar_fpclass, + EVEX_CD8<16, CD8VT1>, AVX512PSIi8Base, TA; defm PS : avx512_vector_fpclass_all, - EVEX_CD8<32, CD8VF>; + sched, HasDQI>, + EVEX_CD8<32, CD8VF>, AVX512AIi8Base; defm PD : avx512_vector_fpclass_all, - EVEX_CD8<64, CD8VF> , VEX_W; + sched, HasDQI>, + EVEX_CD8<64, CD8VF>, AVX512AIi8Base, VEX_W; defm SSZ : avx512_scalar_fpclass, VEX_LIG, - EVEX_CD8<32, CD8VT1>; + sched.Scl, f32x_info, HasDQI>, VEX_LIG, + EVEX_CD8<32, CD8VT1>, AVX512AIi8Base; defm SDZ : avx512_scalar_fpclass, VEX_LIG, - EVEX_CD8<64, CD8VT1>, VEX_W; + sched.Scl, f64x_info, HasDQI>, VEX_LIG, + EVEX_CD8<64, CD8VT1>, 
AVX512AIi8Base, VEX_W; } -defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp, - HasDQI>, AVX512AIi8Base, EVEX; +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, SchedWriteFCmp>, EVEX; //----------------------------------------------------------------- // Mask register copy, including @@ -5957,35 +5961,50 @@ multiclass avx512_fp_scalef_scalar opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fp_scalef_all opc, bits<8> opcScaler, string OpcodeStr, X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in { + defm PHZ : avx512_fp_scalef_p, + avx512_fp_round_packed, + EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>; + defm SHZ : avx512_fp_scalef_scalar, + avx512_fp_scalar_round, + EVEX_4V, T_MAP6PD, EVEX_CD8<16, CD8VT1>; + } defm PSZ : avx512_fp_scalef_p, avx512_fp_round_packed, - EVEX_V512, EVEX_CD8<32, CD8VF>; + EVEX_V512, EVEX_CD8<32, CD8VF>, T8PD; defm PDZ : avx512_fp_scalef_p, avx512_fp_round_packed, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>, T8PD; defm SSZ : avx512_fp_scalef_scalar, avx512_fp_scalar_round, - EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8PD; defm SDZ : avx512_fp_scalef_scalar, avx512_fp_scalar_round, - EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W; + EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W, T8PD; // Define only if AVX512VL feature is present. let Predicates = [HasVLX] in { defm PSZ128 : avx512_fp_scalef_p, - EVEX_V128, EVEX_CD8<32, CD8VF>; + EVEX_V128, EVEX_CD8<32, CD8VF>, T8PD; defm PSZ256 : avx512_fp_scalef_p, - EVEX_V256, EVEX_CD8<32, CD8VF>; + EVEX_V256, EVEX_CD8<32, CD8VF>, T8PD; defm PDZ128 : avx512_fp_scalef_p, - EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>, T8PD; defm PDZ256 : avx512_fp_scalef_p, - EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>, T8PD; + } + + let Predicates = [HasFP16, HasVLX] in { + defm PHZ128 : avx512_fp_scalef_p, + EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6PD; + defm PHZ256 : avx512_fp_scalef_p, + EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6PD; } } defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", - SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible; + SchedWriteFAdd>, NotEVEX2VEXConvertible; //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions @@ -9254,10 +9273,11 @@ let Defs = [EFLAGS], Predicates = [HasFP16] in { } } -/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd +/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd, rcpsh, rsqrtsh multiclass avx512_fp14_s opc, string OpcodeStr, SDNode OpNode, - X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let Predicates = [HasAVX512], ExeDomain = _.ExeDomain, Uses = [MXCSR] in { + X86FoldableSchedWrite sched, X86VectorVTInfo _, + Predicate prd = HasAVX512> { + let Predicates = [prd], ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable_scalar opc, string OpcodeStr, SDNode OpNode, } } +defm VRCPSHZ : avx512_fp14_s<0x4D, "vrcpsh", X86rcp14s, SchedWriteFRcp.Scl, + f16x_info, HasFP16>, EVEX_CD8<16, CD8VT1>, + T_MAP6PD; +defm VRSQRTSHZ : avx512_fp14_s<0x4F, "vrsqrtsh", X86rsqrt14s, + SchedWriteFRsqrt.Scl, f16x_info, HasFP16>, + EVEX_CD8<16, CD8VT1>, T_MAP6PD; +let Uses = [MXCSR] in { defm VRCP14SSZ : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SchedWriteFRcp.Scl, f32x_info>, EVEX_CD8<32, CD8VT1>, T8PD; @@ -9284,6 +9311,7 @@ defm VRSQRT14SSZ : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, defm VRSQRT14SDZ : avx512_fp14_s<0x4F, "vrsqrt14sd", 
X86rsqrt14s, SchedWriteFRsqrt.Scl, f64x_info>, VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; +} /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p opc, string OpcodeStr, SDNode OpNode, @@ -9307,33 +9335,45 @@ multiclass avx512_fp14_p opc, string OpcodeStr, SDNode OpNode, } } -let Uses = [MXCSR] in multiclass avx512_fp14_p_vl_all opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched> { - defm PSZ : avx512_fp14_p, EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp14_p, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + let Uses = [MXCSR] in { + defm 14PSZ : avx512_fp14_p, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm 14PDZ : avx512_fp14_p, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + } + let Predicates = [HasFP16] in + defm PHZ : avx512_fp14_p, EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>; // Define only if AVX512VL feature is present. - let Predicates = [HasVLX] in { - defm PSZ128 : avx512_fp14_p, - EVEX_V128, EVEX_CD8<32, CD8VF>; - defm PSZ256 : avx512_fp14_p, - EVEX_V256, EVEX_CD8<32, CD8VF>; - defm PDZ128 : avx512_fp14_p, - EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; - defm PDZ256 : avx512_fp14_p, - EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + let Predicates = [HasVLX], Uses = [MXCSR] in { + defm 14PSZ128 : avx512_fp14_p, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm 14PSZ256 : avx512_fp14_p, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm 14PDZ128 : avx512_fp14_p, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm 14PDZ256 : avx512_fp14_p, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } + let Predicates = [HasFP16, HasVLX] in { + defm PHZ128 : avx512_fp14_p, + EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>; + defm PHZ256 : avx512_fp14_p, + EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>; } } -defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SchedWriteFRsqrt>; -defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>; +defm VRSQRT : avx512_fp14_p_vl_all<0x4E, "vrsqrt", X86rsqrt14, SchedWriteFRsqrt>; +defm VRCP : avx512_fp14_p_vl_all<0x4C, "vrcp", X86rcp14, SchedWriteFRcp>; /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s opc, string OpcodeStr,X86VectorVTInfo _, @@ -9363,20 +9403,29 @@ multiclass avx512_fp28_s opc, string OpcodeStr,X86VectorVTInfo _, multiclass avx512_eri_s opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { defm SSZ : avx512_fp28_s, EVEX_CD8<32, CD8VT1>, VEX_LIG; + sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD, EVEX_4V; defm SDZ : avx512_fp28_s, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; + sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W, T8PD, EVEX_4V; +} + +multiclass avx512_vgetexpsh opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeSAE, X86FoldableSchedWrite sched> { + let Predicates = [HasFP16] in + defm SHZ : avx512_fp28_s, + EVEX_CD8<16, CD8VT1>, T_MAP6PD, EVEX_4V; } let Predicates = [HasERI] in { defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs, - SchedWriteFRcp.Scl>, T8PD, EVEX_4V; + SchedWriteFRcp.Scl>; defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs, - SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V; + SchedWriteFRsqrt.Scl>; } defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs, - SchedWriteFRnd.Scl>, T8PD, EVEX_4V; + SchedWriteFRnd.Scl>, + avx512_vgetexpsh<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs, + SchedWriteFRnd.Scl>; /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd multiclass avx512_fp28_p opc, string OpcodeStr, X86VectorVTInfo _, @@ -9440,6 +9489,19 @@ multiclass avx512_fp_unaryop_packed opc, string OpcodeStr, } } 
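A minimal usage sketch (illustrative only, assuming a toolchain carrying this FP16 series and the standard <immintrin.h> entry points; helper names below are made up for the example): the packed getexp lowering introduced in the next hunk is what the _mm*_getexp_ph intrinsics exercised in the clang tests above ultimately resolve to.

#include <immintrin.h>

// Per-lane exponent extraction on packed FP16; with -mavx512fp16 -mavx512vl
// this is expected to select the VGETEXPPH forms defined below.
__m128h exp128(__m128h x) { return _mm_getexp_ph(x); }

// Masked form: lanes whose mask bit is clear keep the corresponding lane of src.
__m256h exp256_masked(__m256h src, __mmask16 m, __m256h x) {
  return _mm256_mask_getexp_ph(src, m, x);
}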
+multiclass avx512_vgetexp_fp16 opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeSAE, X86SchedWriteWidths sched> { + let Predicates = [HasFP16] in + defm PHZ : avx512_fp28_p, + avx512_fp28_p_sae, + T_MAP6PD, EVEX_V512, EVEX_CD8<16, CD8VF>; + let Predicates = [HasFP16, HasVLX] in { + defm PHZ128 : avx512_fp28_p, + EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>; + defm PHZ256 : avx512_fp28_p, + EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>; + } +} let Predicates = [HasERI] in { defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE, SchedWriteFRsqrt>, EVEX; @@ -9450,6 +9512,8 @@ let Predicates = [HasERI] in { } defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE, SchedWriteFRnd>, + avx512_vgetexp_fp16<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE, + SchedWriteFRnd>, avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>, EVEX; @@ -9487,6 +9551,18 @@ multiclass avx512_sqrt_packed opc, string OpcodeStr, let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass avx512_sqrt_packed_all opc, string OpcodeStr, X86SchedWriteSizes sched> { + let Predicates = [HasFP16] in + defm PHZ : avx512_sqrt_packed, + EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; + let Predicates = [HasFP16, HasVLX] in { + defm PHZ128 : avx512_sqrt_packed, + EVEX_V128, T_MAP5PS, EVEX_CD8<16, CD8VF>; + defm PHZ256 : avx512_sqrt_packed, + EVEX_V256, T_MAP5PS, EVEX_CD8<16, CD8VF>; + } defm PSZ : avx512_sqrt_packed, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; @@ -9513,6 +9589,10 @@ multiclass avx512_sqrt_packed_all opc, string OpcodeStr, let Uses = [MXCSR] in multiclass avx512_sqrt_packed_all_round opc, string OpcodeStr, X86SchedWriteSizes sched> { + let Predicates = [HasFP16] in + defm PHZ : avx512_sqrt_packed_round, + EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; defm PSZ : avx512_sqrt_packed_round, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; @@ -9522,8 +9602,8 @@ multiclass avx512_sqrt_packed_all_round opc, string OpcodeStr, } multiclass avx512_sqrt_scalar opc, string OpcodeStr, X86FoldableSchedWrite sched, - X86VectorVTInfo _, string Name> { - let ExeDomain = _.ExeDomain in { + X86VectorVTInfo _, string Name, Predicate prd = HasAVX512> { + let ExeDomain = _.ExeDomain, Predicates = [prd] in { defm r_Int : AVX512_maskable_scalar opc, string OpcodeStr, X86FoldableSchedWri (i32 timm:$rc))>, EVEX_B, EVEX_RC, Sched<[sched]>; - let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in { + let isCodeGenOnly = 1, hasSideEffects = 0 in { def r : I, @@ -9558,13 +9638,13 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, X86FoldableSchedWri } } - let Predicates = [HasAVX512] in { + let Predicates = [prd] in { def : Pat<(_.EltVT (any_fsqrt _.FRC:$src)), (!cast(Name#Zr) (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; } - let Predicates = [HasAVX512, OptForSize] in { + let Predicates = [prd, OptForSize] in { def : Pat<(_.EltVT (any_fsqrt (load addr:$src))), (!cast(Name#Zm) (_.EltVT (IMPLICIT_DEF)), addr:$src)>; @@ -9573,6 +9653,8 @@ multiclass avx512_sqrt_scalar opc, string OpcodeStr, X86FoldableSchedWri multiclass avx512_sqrt_scalar_all opc, string OpcodeStr, X86SchedWriteSizes sched> { + defm SHZ : avx512_sqrt_scalar, + EVEX_CD8<16, CD8VT1>, EVEX_4V, T_MAP5XS; defm SSZ : avx512_sqrt_scalar, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; defm SDZ : avx512_sqrt_scalar, @@ -9637,6 +9719,12 @@ multiclass avx512_rndscale_scalar opc, string OpcodeStr, } } +let Predicates = [HasFP16] in +defm VRNDSCALESHZ : avx512_rndscale_scalar<0x0A, "vrndscalesh", + SchedWriteFRnd.Scl, f16x_info>, + AVX512PSIi8Base, TA, EVEX_4V, + EVEX_CD8<16, CD8VT1>; 
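A companion scalar sketch (same assumptions and caveats as the previous example; helper names are illustrative): the VSQRTSH/VRNDSCALESH scalar forms defined here follow the SS/SD convention of operating on element 0 and passing the upper seven half elements through from the first source operand.

#include <immintrin.h>

// sqrt of b[0]; a[1..7] are copied to the upper lanes of the result.
__m128h sqrt_low(__m128h a, __m128h b) { return _mm_sqrt_sh(a, b); }

// Round b[0] to an integer toward zero (immediate 3, as in the tests above);
// upper lanes again come from a.
__m128h trunc_low(__m128h a, __m128h b) { return _mm_roundscale_sh(a, b, 3); }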
+ defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless", SchedWriteFRnd.Scl, f32x_info>, AVX512AIi8Base, EVEX_4V, VEX_LIG, @@ -9665,6 +9753,9 @@ multiclass avx512_masked_scalar; defm : avx512_masked_scalar; @@ -10883,24 +10974,26 @@ multiclass avx512_common_unary_fp_sae_packed_imm_all opcPs, bits<8> opcPd, SDPatternOperator OpNode, SDPatternOperator MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{ + defm PH : avx512_common_unary_fp_sae_packed_imm, + AVX512PSIi8Base, TA, EVEX, EVEX_CD8<16, CD8VF>; defm PS : avx512_common_unary_fp_sae_packed_imm, - EVEX_CD8<32, CD8VF>; + AVX512AIi8Base, EVEX, EVEX_CD8<32, CD8VF>; defm PD : avx512_common_unary_fp_sae_packed_imm, - EVEX_CD8<64, CD8VF>, VEX_W; + AVX512AIi8Base, EVEX, EVEX_CD8<64, CD8VF>, VEX_W; } defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, X86VReduce, X86VReduce, X86VReduceSAE, - SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX; + SchedWriteFRnd, HasDQI>; defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, X86any_VRndScale, X86VRndScale, X86VRndScaleSAE, - SchedWriteFRnd, HasAVX512>, - AVX512AIi8Base, EVEX; + SchedWriteFRnd, HasAVX512>; defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, X86VGetMant, X86VGetMant, X86VGetMantSAE, - SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX; + SchedWriteFRnd, HasAVX512>; defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, 0x50, X86VRange, X86VRangeSAE, @@ -10924,6 +11017,9 @@ defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VREDUCESH: avx512_common_fp_sae_scalar_imm<"vreducesh", f16x_info, + 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasFP16>, + AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>; defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>, @@ -10931,6 +11027,9 @@ defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info, + 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasFP16>, + AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>; multiclass avx512_shuff_packed_128_common opc, string OpcodeStr, X86FoldableSchedWrite sched, @@ -12193,6 +12292,7 @@ multiclass AVX512_scalar_unary_math_patterns; defm : AVX512_scalar_unary_math_patterns; +defm : AVX512_scalar_unary_math_patterns; //===----------------------------------------------------------------------===// // AES instructions diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 44007b34fcfe29..959c8d4a2d886c 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -815,10 +815,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VFPCLASSPDZ128rr, X86::VFPCLASSPDZ128rm, 0 }, { X86::VFPCLASSPDZ256rr, X86::VFPCLASSPDZ256rm, 0 }, { X86::VFPCLASSPDZrr, X86::VFPCLASSPDZrm, 0 }, + { X86::VFPCLASSPHZ128rr, X86::VFPCLASSPHZ128rm, 0 }, + { X86::VFPCLASSPHZ256rr, X86::VFPCLASSPHZ256rm, 
0 }, + { X86::VFPCLASSPHZrr, X86::VFPCLASSPHZrm, 0 }, { X86::VFPCLASSPSZ128rr, X86::VFPCLASSPSZ128rm, 0 }, { X86::VFPCLASSPSZ256rr, X86::VFPCLASSPSZ256rm, 0 }, { X86::VFPCLASSPSZrr, X86::VFPCLASSPSZrm, 0 }, { X86::VFPCLASSSDZrr, X86::VFPCLASSSDZrm, TB_NO_REVERSE }, + { X86::VFPCLASSSHZrr, X86::VFPCLASSSHZrm, TB_NO_REVERSE }, { X86::VFPCLASSSSZrr, X86::VFPCLASSSSZrm, TB_NO_REVERSE }, { X86::VFRCZPDYrr, X86::VFRCZPDYrm, 0 }, { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 }, @@ -829,12 +833,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VGETEXPPDZ128r, X86::VGETEXPPDZ128m, 0 }, { X86::VGETEXPPDZ256r, X86::VGETEXPPDZ256m, 0 }, { X86::VGETEXPPDZr, X86::VGETEXPPDZm, 0 }, + { X86::VGETEXPPHZ128r, X86::VGETEXPPHZ128m, 0 }, + { X86::VGETEXPPHZ256r, X86::VGETEXPPHZ256m, 0 }, + { X86::VGETEXPPHZr, X86::VGETEXPPHZm, 0 }, { X86::VGETEXPPSZ128r, X86::VGETEXPPSZ128m, 0 }, { X86::VGETEXPPSZ256r, X86::VGETEXPPSZ256m, 0 }, { X86::VGETEXPPSZr, X86::VGETEXPPSZm, 0 }, { X86::VGETMANTPDZ128rri, X86::VGETMANTPDZ128rmi, 0 }, { X86::VGETMANTPDZ256rri, X86::VGETMANTPDZ256rmi, 0 }, { X86::VGETMANTPDZrri, X86::VGETMANTPDZrmi, 0 }, + { X86::VGETMANTPHZ128rri, X86::VGETMANTPHZ128rmi, 0 }, + { X86::VGETMANTPHZ256rri, X86::VGETMANTPHZ256rmi, 0 }, + { X86::VGETMANTPHZrri, X86::VGETMANTPHZrmi, 0 }, { X86::VGETMANTPSZ128rri, X86::VGETMANTPSZ128rmi, 0 }, { X86::VGETMANTPSZ256rri, X86::VGETMANTPSZ256rmi, 0 }, { X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 }, @@ -1161,17 +1171,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VRCP14PSZr, X86::VRCP14PSZm, 0 }, { X86::VRCP28PDZr, X86::VRCP28PDZm, 0 }, { X86::VRCP28PSZr, X86::VRCP28PSZm, 0 }, + { X86::VRCPPHZ128r, X86::VRCPPHZ128m, 0 }, + { X86::VRCPPHZ256r, X86::VRCPPHZ256m, 0 }, + { X86::VRCPPHZr, X86::VRCPPHZm, 0 }, { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, { X86::VRCPPSr, X86::VRCPPSm, 0 }, { X86::VREDUCEPDZ128rri, X86::VREDUCEPDZ128rmi, 0 }, { X86::VREDUCEPDZ256rri, X86::VREDUCEPDZ256rmi, 0 }, { X86::VREDUCEPDZrri, X86::VREDUCEPDZrmi, 0 }, + { X86::VREDUCEPHZ128rri, X86::VREDUCEPHZ128rmi, 0 }, + { X86::VREDUCEPHZ256rri, X86::VREDUCEPHZ256rmi, 0 }, + { X86::VREDUCEPHZrri, X86::VREDUCEPHZrmi, 0 }, { X86::VREDUCEPSZ128rri, X86::VREDUCEPSZ128rmi, 0 }, { X86::VREDUCEPSZ256rri, X86::VREDUCEPSZ256rmi, 0 }, { X86::VREDUCEPSZrri, X86::VREDUCEPSZrmi, 0 }, { X86::VRNDSCALEPDZ128rri, X86::VRNDSCALEPDZ128rmi, 0 }, { X86::VRNDSCALEPDZ256rri, X86::VRNDSCALEPDZ256rmi, 0 }, { X86::VRNDSCALEPDZrri, X86::VRNDSCALEPDZrmi, 0 }, + { X86::VRNDSCALEPHZ128rri, X86::VRNDSCALEPHZ128rmi, 0 }, + { X86::VRNDSCALEPHZ256rri, X86::VRNDSCALEPHZ256rmi, 0 }, + { X86::VRNDSCALEPHZrri, X86::VRNDSCALEPHZrmi, 0 }, { X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0 }, { X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0 }, { X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0 }, @@ -1187,6 +1206,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VRSQRT14PSZr, X86::VRSQRT14PSZm, 0 }, { X86::VRSQRT28PDZr, X86::VRSQRT28PDZm, 0 }, { X86::VRSQRT28PSZr, X86::VRSQRT28PSZm, 0 }, + { X86::VRSQRTPHZ128r, X86::VRSQRTPHZ128m, 0 }, + { X86::VRSQRTPHZ256r, X86::VRSQRTPHZ256m, 0 }, + { X86::VRSQRTPHZr, X86::VRSQRTPHZm, 0 }, { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, @@ -1194,6 +1216,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VSQRTPDZ256r, X86::VSQRTPDZ256m, 0 }, { X86::VSQRTPDZr, X86::VSQRTPDZm, 0 }, { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, + { X86::VSQRTPHZ128r, 
X86::VSQRTPHZ128m, 0 }, + { X86::VSQRTPHZ256r, X86::VSQRTPHZ256m, 0 }, + { X86::VSQRTPHZr, X86::VSQRTPHZm, 0 }, { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, { X86::VSQRTPSZ128r, X86::VSQRTPSZ128m, 0 }, { X86::VSQRTPSZ256r, X86::VSQRTPSZ256m, 0 }, @@ -1864,26 +1889,38 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VFPCLASSPDZ128rrk, X86::VFPCLASSPDZ128rmk, 0 }, { X86::VFPCLASSPDZ256rrk, X86::VFPCLASSPDZ256rmk, 0 }, { X86::VFPCLASSPDZrrk, X86::VFPCLASSPDZrmk, 0 }, + { X86::VFPCLASSPHZ128rrk, X86::VFPCLASSPHZ128rmk, 0 }, + { X86::VFPCLASSPHZ256rrk, X86::VFPCLASSPHZ256rmk, 0 }, + { X86::VFPCLASSPHZrrk, X86::VFPCLASSPHZrmk, 0 }, { X86::VFPCLASSPSZ128rrk, X86::VFPCLASSPSZ128rmk, 0 }, { X86::VFPCLASSPSZ256rrk, X86::VFPCLASSPSZ256rmk, 0 }, { X86::VFPCLASSPSZrrk, X86::VFPCLASSPSZrmk, 0 }, { X86::VFPCLASSSDZrrk, X86::VFPCLASSSDZrmk, TB_NO_REVERSE }, + { X86::VFPCLASSSHZrrk, X86::VFPCLASSSHZrmk, TB_NO_REVERSE }, { X86::VFPCLASSSSZrrk, X86::VFPCLASSSSZrmk, TB_NO_REVERSE }, { X86::VGETEXPPDZ128rkz, X86::VGETEXPPDZ128mkz, 0 }, { X86::VGETEXPPDZ256rkz, X86::VGETEXPPDZ256mkz, 0 }, { X86::VGETEXPPDZrkz, X86::VGETEXPPDZmkz, 0 }, + { X86::VGETEXPPHZ128rkz, X86::VGETEXPPHZ128mkz, 0 }, + { X86::VGETEXPPHZ256rkz, X86::VGETEXPPHZ256mkz, 0 }, + { X86::VGETEXPPHZrkz, X86::VGETEXPPHZmkz, 0 }, { X86::VGETEXPPSZ128rkz, X86::VGETEXPPSZ128mkz, 0 }, { X86::VGETEXPPSZ256rkz, X86::VGETEXPPSZ256mkz, 0 }, { X86::VGETEXPPSZrkz, X86::VGETEXPPSZmkz, 0 }, { X86::VGETEXPSDZr, X86::VGETEXPSDZm, TB_NO_REVERSE }, + { X86::VGETEXPSHZr, X86::VGETEXPSHZm, TB_NO_REVERSE }, { X86::VGETEXPSSZr, X86::VGETEXPSSZm, TB_NO_REVERSE }, { X86::VGETMANTPDZ128rrikz, X86::VGETMANTPDZ128rmikz, 0 }, { X86::VGETMANTPDZ256rrikz, X86::VGETMANTPDZ256rmikz, 0 }, { X86::VGETMANTPDZrrikz, X86::VGETMANTPDZrmikz, 0 }, + { X86::VGETMANTPHZ128rrikz, X86::VGETMANTPHZ128rmikz, 0 }, + { X86::VGETMANTPHZ256rrikz, X86::VGETMANTPHZ256rmikz, 0 }, + { X86::VGETMANTPHZrrikz, X86::VGETMANTPHZrmikz, 0 }, { X86::VGETMANTPSZ128rrikz, X86::VGETMANTPSZ128rmikz, 0 }, { X86::VGETMANTPSZ256rrikz, X86::VGETMANTPSZ256rmikz, 0 }, { X86::VGETMANTPSZrrikz, X86::VGETMANTPSZrmikz, 0 }, { X86::VGETMANTSDZrri, X86::VGETMANTSDZrmi, TB_NO_REVERSE }, + { X86::VGETMANTSHZrri, X86::VGETMANTSHZrmi, TB_NO_REVERSE }, { X86::VGETMANTSSZrri, X86::VGETMANTSSZrmi, TB_NO_REVERSE }, { X86::VGF2P8AFFINEINVQBYrri, X86::VGF2P8AFFINEINVQBYrmi, 0 }, { X86::VGF2P8AFFINEINVQBZ128rri, X86::VGF2P8AFFINEINVQBZ128rmi, 0 }, @@ -2899,24 +2936,37 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VRCP28PSZrkz, X86::VRCP28PSZmkz, 0 }, { X86::VRCP28SDZr, X86::VRCP28SDZm, TB_NO_REVERSE }, { X86::VRCP28SSZr, X86::VRCP28SSZm, TB_NO_REVERSE }, + { X86::VRCPPHZ128rkz, X86::VRCPPHZ128mkz, 0 }, + { X86::VRCPPHZ256rkz, X86::VRCPPHZ256mkz, 0 }, + { X86::VRCPPHZrkz, X86::VRCPPHZmkz, 0 }, + { X86::VRCPSHZrr, X86::VRCPSHZrm, TB_NO_REVERSE }, { X86::VRCPSSr, X86::VRCPSSm, 0 }, { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE }, { X86::VREDUCEPDZ128rrikz, X86::VREDUCEPDZ128rmikz, 0 }, { X86::VREDUCEPDZ256rrikz, X86::VREDUCEPDZ256rmikz, 0 }, { X86::VREDUCEPDZrrikz, X86::VREDUCEPDZrmikz, 0 }, + { X86::VREDUCEPHZ128rrikz, X86::VREDUCEPHZ128rmikz, 0 }, + { X86::VREDUCEPHZ256rrikz, X86::VREDUCEPHZ256rmikz, 0 }, + { X86::VREDUCEPHZrrikz, X86::VREDUCEPHZrmikz, 0 }, { X86::VREDUCEPSZ128rrikz, X86::VREDUCEPSZ128rmikz, 0 }, { X86::VREDUCEPSZ256rrikz, X86::VREDUCEPSZ256rmikz, 0 }, { X86::VREDUCEPSZrrikz, X86::VREDUCEPSZrmikz, 0 }, { X86::VREDUCESDZrri, X86::VREDUCESDZrmi, TB_NO_REVERSE }, + { 
X86::VREDUCESHZrri, X86::VREDUCESHZrmi, TB_NO_REVERSE }, { X86::VREDUCESSZrri, X86::VREDUCESSZrmi, TB_NO_REVERSE }, { X86::VRNDSCALEPDZ128rrikz, X86::VRNDSCALEPDZ128rmikz, 0 }, { X86::VRNDSCALEPDZ256rrikz, X86::VRNDSCALEPDZ256rmikz, 0 }, { X86::VRNDSCALEPDZrrikz, X86::VRNDSCALEPDZrmikz, 0 }, + { X86::VRNDSCALEPHZ128rrikz, X86::VRNDSCALEPHZ128rmikz, 0 }, + { X86::VRNDSCALEPHZ256rrikz, X86::VRNDSCALEPHZ256rmikz, 0 }, + { X86::VRNDSCALEPHZrrikz, X86::VRNDSCALEPHZrmikz, 0 }, { X86::VRNDSCALEPSZ128rrikz, X86::VRNDSCALEPSZ128rmikz, 0 }, { X86::VRNDSCALEPSZ256rrikz, X86::VRNDSCALEPSZ256rmikz, 0 }, { X86::VRNDSCALEPSZrrikz, X86::VRNDSCALEPSZrmikz, 0 }, { X86::VRNDSCALESDZr, X86::VRNDSCALESDZm, 0 }, { X86::VRNDSCALESDZr_Int, X86::VRNDSCALESDZm_Int, TB_NO_REVERSE }, + { X86::VRNDSCALESHZr, X86::VRNDSCALESHZm, 0 }, + { X86::VRNDSCALESHZr_Int, X86::VRNDSCALESHZm_Int, TB_NO_REVERSE }, { X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0 }, { X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE }, { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, @@ -2935,15 +2985,23 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VRSQRT28PSZrkz, X86::VRSQRT28PSZmkz, 0 }, { X86::VRSQRT28SDZr, X86::VRSQRT28SDZm, TB_NO_REVERSE }, { X86::VRSQRT28SSZr, X86::VRSQRT28SSZm, TB_NO_REVERSE }, + { X86::VRSQRTPHZ128rkz, X86::VRSQRTPHZ128mkz, 0 }, + { X86::VRSQRTPHZ256rkz, X86::VRSQRTPHZ256mkz, 0 }, + { X86::VRSQRTPHZrkz, X86::VRSQRTPHZmkz, 0 }, + { X86::VRSQRTSHZrr, X86::VRSQRTSHZrm, TB_NO_REVERSE }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE }, { X86::VSCALEFPDZ128rr, X86::VSCALEFPDZ128rm, 0 }, { X86::VSCALEFPDZ256rr, X86::VSCALEFPDZ256rm, 0 }, { X86::VSCALEFPDZrr, X86::VSCALEFPDZrm, 0 }, + { X86::VSCALEFPHZ128rr, X86::VSCALEFPHZ128rm, 0 }, + { X86::VSCALEFPHZ256rr, X86::VSCALEFPHZ256rm, 0 }, + { X86::VSCALEFPHZrr, X86::VSCALEFPHZrm, 0 }, { X86::VSCALEFPSZ128rr, X86::VSCALEFPSZ128rm, 0 }, { X86::VSCALEFPSZ256rr, X86::VSCALEFPSZ256rm, 0 }, { X86::VSCALEFPSZrr, X86::VSCALEFPSZrm, 0 }, { X86::VSCALEFSDZrr, X86::VSCALEFSDZrm, TB_NO_REVERSE }, + { X86::VSCALEFSHZrr, X86::VSCALEFSHZrm, TB_NO_REVERSE }, { X86::VSCALEFSSZrr, X86::VSCALEFSSZrm, TB_NO_REVERSE }, { X86::VSHUFF32X4Z256rri, X86::VSHUFF32X4Z256rmi, 0 }, { X86::VSHUFF32X4Zrri, X86::VSHUFF32X4Zrmi, 0 }, @@ -2966,6 +3024,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VSQRTPDZ128rkz, X86::VSQRTPDZ128mkz, 0 }, { X86::VSQRTPDZ256rkz, X86::VSQRTPDZ256mkz, 0 }, { X86::VSQRTPDZrkz, X86::VSQRTPDZmkz, 0 }, + { X86::VSQRTPHZ128rkz, X86::VSQRTPHZ128mkz, 0 }, + { X86::VSQRTPHZ256rkz, X86::VSQRTPHZ256mkz, 0 }, + { X86::VSQRTPHZrkz, X86::VSQRTPHZmkz, 0 }, { X86::VSQRTPSZ128rkz, X86::VSQRTPSZ128mkz, 0 }, { X86::VSQRTPSZ256rkz, X86::VSQRTPSZ256mkz, 0 }, { X86::VSQRTPSZrkz, X86::VSQRTPSZmkz, 0 }, @@ -2973,6 +3034,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VSQRTSDZr_Int, X86::VSQRTSDZm_Int, TB_NO_REVERSE }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE }, + { X86::VSQRTSHZr, X86::VSQRTSHZm, 0 }, + { X86::VSQRTSHZr_Int, X86::VSQRTSHZm_Int, TB_NO_REVERSE }, { X86::VSQRTSSZr, X86::VSQRTSSZm, 0 }, { X86::VSQRTSSZr_Int, X86::VSQRTSSZm_Int, TB_NO_REVERSE }, { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, @@ -3539,18 +3602,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VGETEXPPDZ128rk, X86::VGETEXPPDZ128mk, 0 }, { X86::VGETEXPPDZ256rk, X86::VGETEXPPDZ256mk, 0 }, { X86::VGETEXPPDZrk, X86::VGETEXPPDZmk, 0 }, + { 
X86::VGETEXPPHZ128rk, X86::VGETEXPPHZ128mk, 0 }, + { X86::VGETEXPPHZ256rk, X86::VGETEXPPHZ256mk, 0 }, + { X86::VGETEXPPHZrk, X86::VGETEXPPHZmk, 0 }, { X86::VGETEXPPSZ128rk, X86::VGETEXPPSZ128mk, 0 }, { X86::VGETEXPPSZ256rk, X86::VGETEXPPSZ256mk, 0 }, { X86::VGETEXPPSZrk, X86::VGETEXPPSZmk, 0 }, { X86::VGETEXPSDZrkz, X86::VGETEXPSDZmkz, TB_NO_REVERSE }, + { X86::VGETEXPSHZrkz, X86::VGETEXPSHZmkz, TB_NO_REVERSE }, { X86::VGETEXPSSZrkz, X86::VGETEXPSSZmkz, TB_NO_REVERSE }, { X86::VGETMANTPDZ128rrik, X86::VGETMANTPDZ128rmik, 0 }, { X86::VGETMANTPDZ256rrik, X86::VGETMANTPDZ256rmik, 0 }, { X86::VGETMANTPDZrrik, X86::VGETMANTPDZrmik, 0 }, + { X86::VGETMANTPHZ128rrik, X86::VGETMANTPHZ128rmik, 0 }, + { X86::VGETMANTPHZ256rrik, X86::VGETMANTPHZ256rmik, 0 }, + { X86::VGETMANTPHZrrik, X86::VGETMANTPHZrmik, 0 }, { X86::VGETMANTPSZ128rrik, X86::VGETMANTPSZ128rmik, 0 }, { X86::VGETMANTPSZ256rrik, X86::VGETMANTPSZ256rmik, 0 }, { X86::VGETMANTPSZrrik, X86::VGETMANTPSZrmik, 0 }, { X86::VGETMANTSDZrrikz, X86::VGETMANTSDZrmikz, TB_NO_REVERSE }, + { X86::VGETMANTSHZrrikz, X86::VGETMANTSHZrmikz, TB_NO_REVERSE }, { X86::VGETMANTSSZrrikz, X86::VGETMANTSSZrmikz, TB_NO_REVERSE }, { X86::VGF2P8AFFINEINVQBZ128rrikz, X86::VGF2P8AFFINEINVQBZ128rmikz, 0 }, { X86::VGF2P8AFFINEINVQBZ256rrikz, X86::VGF2P8AFFINEINVQBZ256rmikz, 0 }, @@ -4331,21 +4402,33 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VRCP28PSZrk, X86::VRCP28PSZmk, 0 }, { X86::VRCP28SDZrkz, X86::VRCP28SDZmkz, TB_NO_REVERSE }, { X86::VRCP28SSZrkz, X86::VRCP28SSZmkz, TB_NO_REVERSE }, + { X86::VRCPPHZ128rk, X86::VRCPPHZ128mk, 0 }, + { X86::VRCPPHZ256rk, X86::VRCPPHZ256mk, 0 }, + { X86::VRCPPHZrk, X86::VRCPPHZmk, 0 }, + { X86::VRCPSHZrrkz, X86::VRCPSHZrmkz, TB_NO_REVERSE }, { X86::VREDUCEPDZ128rrik, X86::VREDUCEPDZ128rmik, 0 }, { X86::VREDUCEPDZ256rrik, X86::VREDUCEPDZ256rmik, 0 }, { X86::VREDUCEPDZrrik, X86::VREDUCEPDZrmik, 0 }, + { X86::VREDUCEPHZ128rrik, X86::VREDUCEPHZ128rmik, 0 }, + { X86::VREDUCEPHZ256rrik, X86::VREDUCEPHZ256rmik, 0 }, + { X86::VREDUCEPHZrrik, X86::VREDUCEPHZrmik, 0 }, { X86::VREDUCEPSZ128rrik, X86::VREDUCEPSZ128rmik, 0 }, { X86::VREDUCEPSZ256rrik, X86::VREDUCEPSZ256rmik, 0 }, { X86::VREDUCEPSZrrik, X86::VREDUCEPSZrmik, 0 }, { X86::VREDUCESDZrrikz, X86::VREDUCESDZrmikz, TB_NO_REVERSE }, + { X86::VREDUCESHZrrikz, X86::VREDUCESHZrmikz, TB_NO_REVERSE }, { X86::VREDUCESSZrrikz, X86::VREDUCESSZrmikz, TB_NO_REVERSE }, { X86::VRNDSCALEPDZ128rrik, X86::VRNDSCALEPDZ128rmik, 0 }, { X86::VRNDSCALEPDZ256rrik, X86::VRNDSCALEPDZ256rmik, 0 }, { X86::VRNDSCALEPDZrrik, X86::VRNDSCALEPDZrmik, 0 }, + { X86::VRNDSCALEPHZ128rrik, X86::VRNDSCALEPHZ128rmik, 0 }, + { X86::VRNDSCALEPHZ256rrik, X86::VRNDSCALEPHZ256rmik, 0 }, + { X86::VRNDSCALEPHZrrik, X86::VRNDSCALEPHZrmik, 0 }, { X86::VRNDSCALEPSZ128rrik, X86::VRNDSCALEPSZ128rmik, 0 }, { X86::VRNDSCALEPSZ256rrik, X86::VRNDSCALEPSZ256rmik, 0 }, { X86::VRNDSCALEPSZrrik, X86::VRNDSCALEPSZrmik, 0 }, { X86::VRNDSCALESDZr_Intkz, X86::VRNDSCALESDZm_Intkz, TB_NO_REVERSE }, + { X86::VRNDSCALESHZr_Intkz, X86::VRNDSCALESHZm_Intkz, TB_NO_REVERSE }, { X86::VRNDSCALESSZr_Intkz, X86::VRNDSCALESSZm_Intkz, TB_NO_REVERSE }, { X86::VRSQRT14PDZ128rk, X86::VRSQRT14PDZ128mk, 0 }, { X86::VRSQRT14PDZ256rk, X86::VRSQRT14PDZ256mk, 0 }, @@ -4359,13 +4442,21 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VRSQRT28PSZrk, X86::VRSQRT28PSZmk, 0 }, { X86::VRSQRT28SDZrkz, X86::VRSQRT28SDZmkz, TB_NO_REVERSE }, { X86::VRSQRT28SSZrkz, X86::VRSQRT28SSZmkz, TB_NO_REVERSE }, + { X86::VRSQRTPHZ128rk, 
X86::VRSQRTPHZ128mk, 0 }, + { X86::VRSQRTPHZ256rk, X86::VRSQRTPHZ256mk, 0 }, + { X86::VRSQRTPHZrk, X86::VRSQRTPHZmk, 0 }, + { X86::VRSQRTSHZrrkz, X86::VRSQRTSHZrmkz, TB_NO_REVERSE }, { X86::VSCALEFPDZ128rrkz, X86::VSCALEFPDZ128rmkz, 0 }, { X86::VSCALEFPDZ256rrkz, X86::VSCALEFPDZ256rmkz, 0 }, { X86::VSCALEFPDZrrkz, X86::VSCALEFPDZrmkz, 0 }, + { X86::VSCALEFPHZ128rrkz, X86::VSCALEFPHZ128rmkz, 0 }, + { X86::VSCALEFPHZ256rrkz, X86::VSCALEFPHZ256rmkz, 0 }, + { X86::VSCALEFPHZrrkz, X86::VSCALEFPHZrmkz, 0 }, { X86::VSCALEFPSZ128rrkz, X86::VSCALEFPSZ128rmkz, 0 }, { X86::VSCALEFPSZ256rrkz, X86::VSCALEFPSZ256rmkz, 0 }, { X86::VSCALEFPSZrrkz, X86::VSCALEFPSZrmkz, 0 }, { X86::VSCALEFSDZrrkz, X86::VSCALEFSDZrmkz, TB_NO_REVERSE }, + { X86::VSCALEFSHZrrkz, X86::VSCALEFSHZrmkz, TB_NO_REVERSE }, { X86::VSCALEFSSZrrkz, X86::VSCALEFSSZrmkz, TB_NO_REVERSE }, { X86::VSHUFF32X4Z256rrikz, X86::VSHUFF32X4Z256rmikz, 0 }, { X86::VSHUFF32X4Zrrikz, X86::VSHUFF32X4Zrmikz, 0 }, @@ -4384,10 +4475,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VSQRTPDZ128rk, X86::VSQRTPDZ128mk, 0 }, { X86::VSQRTPDZ256rk, X86::VSQRTPDZ256mk, 0 }, { X86::VSQRTPDZrk, X86::VSQRTPDZmk, 0 }, + { X86::VSQRTPHZ128rk, X86::VSQRTPHZ128mk, 0 }, + { X86::VSQRTPHZ256rk, X86::VSQRTPHZ256mk, 0 }, + { X86::VSQRTPHZrk, X86::VSQRTPHZmk, 0 }, { X86::VSQRTPSZ128rk, X86::VSQRTPSZ128mk, 0 }, { X86::VSQRTPSZ256rk, X86::VSQRTPSZ256mk, 0 }, { X86::VSQRTPSZrk, X86::VSQRTPSZmk, 0 }, { X86::VSQRTSDZr_Intkz, X86::VSQRTSDZm_Intkz, TB_NO_REVERSE }, + { X86::VSQRTSHZr_Intkz, X86::VSQRTSHZm_Intkz, TB_NO_REVERSE }, { X86::VSQRTSSZr_Intkz, X86::VSQRTSSZm_Intkz, TB_NO_REVERSE }, { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, @@ -4763,8 +4858,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VFNMSUB231SSZr_Intk, X86::VFNMSUB231SSZm_Intk, TB_NO_REVERSE }, { X86::VFNMSUB231SSZr_Intkz, X86::VFNMSUB231SSZm_Intkz, TB_NO_REVERSE }, { X86::VGETEXPSDZrk, X86::VGETEXPSDZmk, TB_NO_REVERSE }, + { X86::VGETEXPSHZrk, X86::VGETEXPSHZmk, TB_NO_REVERSE }, { X86::VGETEXPSSZrk, X86::VGETEXPSSZmk, TB_NO_REVERSE }, { X86::VGETMANTSDZrrik, X86::VGETMANTSDZrmik, TB_NO_REVERSE }, + { X86::VGETMANTSHZrrik, X86::VGETMANTSHZrmik, TB_NO_REVERSE }, { X86::VGETMANTSSZrrik, X86::VGETMANTSSZrmik, TB_NO_REVERSE }, { X86::VGF2P8AFFINEINVQBZ128rrik, X86::VGF2P8AFFINEINVQBZ128rmik, 0 }, { X86::VGF2P8AFFINEINVQBZ256rrik, X86::VGF2P8AFFINEINVQBZ256rmik, 0 }, @@ -5320,21 +5417,29 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VRCP14SSZrrk, X86::VRCP14SSZrmk, TB_NO_REVERSE }, { X86::VRCP28SDZrk, X86::VRCP28SDZmk, TB_NO_REVERSE }, { X86::VRCP28SSZrk, X86::VRCP28SSZmk, TB_NO_REVERSE }, + { X86::VRCPSHZrrk, X86::VRCPSHZrmk, TB_NO_REVERSE }, { X86::VREDUCESDZrrik, X86::VREDUCESDZrmik, TB_NO_REVERSE }, + { X86::VREDUCESHZrrik, X86::VREDUCESHZrmik, TB_NO_REVERSE }, { X86::VREDUCESSZrrik, X86::VREDUCESSZrmik, TB_NO_REVERSE }, { X86::VRNDSCALESDZr_Intk, X86::VRNDSCALESDZm_Intk, TB_NO_REVERSE }, + { X86::VRNDSCALESHZr_Intk, X86::VRNDSCALESHZm_Intk, TB_NO_REVERSE }, { X86::VRNDSCALESSZr_Intk, X86::VRNDSCALESSZm_Intk, TB_NO_REVERSE }, { X86::VRSQRT14SDZrrk, X86::VRSQRT14SDZrmk, TB_NO_REVERSE }, { X86::VRSQRT14SSZrrk, X86::VRSQRT14SSZrmk, TB_NO_REVERSE }, { X86::VRSQRT28SDZrk, X86::VRSQRT28SDZmk, TB_NO_REVERSE }, { X86::VRSQRT28SSZrk, X86::VRSQRT28SSZmk, TB_NO_REVERSE }, + { X86::VRSQRTSHZrrk, X86::VRSQRTSHZrmk, TB_NO_REVERSE }, { X86::VSCALEFPDZ128rrk, X86::VSCALEFPDZ128rmk, 0 }, { 
X86::VSCALEFPDZ256rrk, X86::VSCALEFPDZ256rmk, 0 }, { X86::VSCALEFPDZrrk, X86::VSCALEFPDZrmk, 0 }, + { X86::VSCALEFPHZ128rrk, X86::VSCALEFPHZ128rmk, 0 }, + { X86::VSCALEFPHZ256rrk, X86::VSCALEFPHZ256rmk, 0 }, + { X86::VSCALEFPHZrrk, X86::VSCALEFPHZrmk, 0 }, { X86::VSCALEFPSZ128rrk, X86::VSCALEFPSZ128rmk, 0 }, { X86::VSCALEFPSZ256rrk, X86::VSCALEFPSZ256rmk, 0 }, { X86::VSCALEFPSZrrk, X86::VSCALEFPSZrmk, 0 }, { X86::VSCALEFSDZrrk, X86::VSCALEFSDZrmk, TB_NO_REVERSE }, + { X86::VSCALEFSHZrrk, X86::VSCALEFSHZrmk, TB_NO_REVERSE }, { X86::VSCALEFSSZrrk, X86::VSCALEFSSZrmk, TB_NO_REVERSE }, { X86::VSHUFF32X4Z256rrik, X86::VSHUFF32X4Z256rmik, 0 }, { X86::VSHUFF32X4Zrrik, X86::VSHUFF32X4Zrmik, 0 }, @@ -5351,6 +5456,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 }, { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 }, { X86::VSQRTSDZr_Intk, X86::VSQRTSDZm_Intk, TB_NO_REVERSE }, + { X86::VSQRTSHZr_Intk, X86::VSQRTSHZm_Intk, TB_NO_REVERSE }, { X86::VSQRTSSZr_Intk, X86::VSQRTSSZm_Intk, TB_NO_REVERSE }, { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index a57a956e8135ef..7f0e151b9eba22 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -5269,6 +5269,29 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, case X86::VRCP14SDZrm: case X86::VRCP14SSZrr: case X86::VRCP14SSZrm: + case X86::VRCPSHZrr: + case X86::VRCPSHZrm: + case X86::VRSQRTSHZrr: + case X86::VRSQRTSHZrm: + case X86::VREDUCESHZrmi: + case X86::VREDUCESHZrri: + case X86::VREDUCESHZrrib: + case X86::VGETEXPSHZr: + case X86::VGETEXPSHZrb: + case X86::VGETEXPSHZm: + case X86::VGETMANTSHZrri: + case X86::VGETMANTSHZrrib: + case X86::VGETMANTSHZrmi: + case X86::VRNDSCALESHZr: + case X86::VRNDSCALESHZr_Int: + case X86::VRNDSCALESHZrb_Int: + case X86::VRNDSCALESHZm: + case X86::VRNDSCALESHZm_Int: + case X86::VSQRTSHZr: + case X86::VSQRTSHZr_Int: + case X86::VSQRTSHZrb_Int: + case X86::VSQRTSHZm: + case X86::VSQRTSHZm_Int: case X86::VRCP28SDZr: case X86::VRCP28SDZrb: case X86::VRCP28SDZm: diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index db7e42b20cb145..efc4811084f949 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -989,6 +989,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16), X86_INTRINSIC_DATA(avx512fp16_add_ph_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512fp16_div_ph_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512fp16_fpclass_ph_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512fp16_fpclass_ph_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512fp16_fpclass_ph_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), X86_INTRINSIC_DATA(avx512fp16_mask_add_sh_round, INTR_TYPE_SCALAR_MASK, X86ISD::FADDS, X86ISD::FADDS_RND), X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_128, CMP_MASK_CC, X86ISD::CMPMM, 0), @@ -998,12 +1001,52 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSETCCM, X86ISD::FSETCCM_SAE), X86_INTRINSIC_DATA(avx512fp16_mask_div_sh_round, INTR_TYPE_SCALAR_MASK, X86ISD::FDIVS, X86ISD::FDIVS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_fpclass_sh, FPCLASSS, X86ISD::VFPCLASSS, 
0), + X86_INTRINSIC_DATA(avx512fp16_mask_getexp_ph_128, INTR_TYPE_1OP_MASK, X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_getexp_ph_256, INTR_TYPE_1OP_MASK, X86ISD::FGETEXP, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_getexp_ph_512, INTR_TYPE_1OP_MASK_SAE, + X86ISD::FGETEXP, X86ISD::FGETEXP_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_getexp_sh, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FGETEXPS, X86ISD::FGETEXPS_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_getmant_ph_128, INTR_TYPE_2OP_MASK_SAE, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_getmant_ph_256, INTR_TYPE_2OP_MASK_SAE, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_getmant_ph_512, INTR_TYPE_2OP_MASK_SAE, + X86ISD::VGETMANT, X86ISD::VGETMANT_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_getmant_sh, INTR_TYPE_3OP_SCALAR_MASK_SAE, + X86ISD::VGETMANTS, X86ISD::VGETMANTS_SAE), X86_INTRINSIC_DATA(avx512fp16_mask_max_sh_round, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::FMAXS, X86ISD::FMAXS_SAE), X86_INTRINSIC_DATA(avx512fp16_mask_min_sh_round, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::FMINS, X86ISD::FMINS_SAE), X86_INTRINSIC_DATA(avx512fp16_mask_mul_sh_round, INTR_TYPE_SCALAR_MASK, X86ISD::FMULS, X86ISD::FMULS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_rcp_ph_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_rcp_ph_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_rcp_ph_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_rcp_sh, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_reduce_ph_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_reduce_ph_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_reduce_ph_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VREDUCE, X86ISD::VREDUCE_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_reduce_sh, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_ph_128, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_ph_256, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_ph_512, INTR_TYPE_2OP_MASK_SAE, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_rndscale_sh, INTR_TYPE_SCALAR_MASK, + X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_ph_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_ph_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_ph_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_rsqrt_sh, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_scalef_ph_128, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_scalef_ph_256, INTR_TYPE_2OP_MASK, X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_scalef_ph_512, INTR_TYPE_2OP_MASK, + X86ISD::SCALEF, X86ISD::SCALEF_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_scalef_sh, INTR_TYPE_SCALAR_MASK, + X86ISD::SCALEFS, X86ISD::SCALEFS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_sqrt_sh, INTR_TYPE_SCALAR_MASK, + X86ISD::FSQRTS, X86ISD::FSQRTS_RND), X86_INTRINSIC_DATA(avx512fp16_mask_sub_sh_round, INTR_TYPE_SCALAR_MASK, X86ISD::FSUBS, X86ISD::FSUBS_RND), X86_INTRINSIC_DATA(avx512fp16_mask_vcvtdq2ph_128, TRUNCATE_TO_REG, @@ -1124,6 +1167,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { 
X86_INTRINSIC_DATA(avx512fp16_min_ph_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512fp16_min_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE), X86_INTRINSIC_DATA(avx512fp16_mul_ph_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512fp16_sqrt_ph_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND), X86_INTRINSIC_DATA(avx512fp16_sub_ph_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512fp16_vcomi_sh, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), /*fp16 scalar convert instruction*/ diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll index 194b1c48c38466..e897c195b90687 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll @@ -274,6 +274,174 @@ entry: ret <32 x i1> %0 } +define half @fneg(half %x) { +; CHECK-LABEL: fneg: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = fneg half %x + ret half %a +} + +define half @fneg_idiom(half %x) { +; CHECK-LABEL: fneg_idiom: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = fsub half -0.0, %x + ret half %a +} + +define half @fabs(half %x) { +; CHECK-LABEL: fabs: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = call half @llvm.fabs.f16(half %x) + ret half %a +} +declare half @llvm.fabs.f16(half) + +define half @fcopysign(half %x, half %y) { +; CHECK-LABEL: fcopysign: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call half @llvm.copysign.f16(half %x, half %y) + ret half %a +} +declare half @llvm.copysign.f16(half, half) + +define <8 x half> @fnegv8f16(<8 x half> %x) { +; CHECK-LABEL: fnegv8f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = fneg <8 x half> %x + ret <8 x half> %a +} + +define <8 x half> @fneg_idiomv8f16(<8 x half> %x) { +; CHECK-LABEL: fneg_idiomv8f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = fsub <8 x half> <half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0>, %x + ret <8 x half> %a +} + +define <8 x half> @fabsv8f16(<8 x half> %x) { +; CHECK-LABEL: fabsv8f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.fabs.v8f16(<8 x half> %x) + ret <8 x half> %a +} +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) + +define <8 x half> @fcopysignv8f16(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fcopysignv8f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: retq + %a = call <8 x half> @llvm.copysign.v8f16(<8 x half> %x, <8 x half> %y) + ret <8 x half> %a +} +declare <8 x half> @llvm.copysign.v8f16(<8 x half>, <8 x half>) + +define <16 x half>
@fnegv16f16(<16 x half> %x) { +; CHECK-LABEL: fnegv16f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %a = fneg <16 x half> %x + ret <16 x half> %a +} + +define <16 x half> @fneg_idiomv16f16(<16 x half> %x) { +; CHECK-LABEL: fneg_idiomv16f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %a = fsub <16 x half> <half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0>, %x + ret <16 x half> %a +} + +define <16 x half> @fabsv16f16(<16 x half> %x) { +; CHECK-LABEL: fabsv16f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.fabs.v16f16(<16 x half> %x) + ret <16 x half> %a +} +declare <16 x half> @llvm.fabs.v16f16(<16 x half>) + +define <16 x half> @fcopysignv16f16(<16 x half> %x, <16 x half> %y) { +; CHECK-LABEL: fcopysignv16f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: retq + %a = call <16 x half> @llvm.copysign.v16f16(<16 x half> %x, <16 x half> %y) + ret <16 x half> %a +} +declare <16 x half> @llvm.copysign.v16f16(<16 x half>, <16 x half>) + +define <32 x half> @fnegv32f16(<32 x half> %x) { +; CHECK-LABEL: fnegv32f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = fneg <32 x half> %x + ret <32 x half> %a +} + +define <32 x half> @fneg_idiomv32f16(<32 x half> %x) { +; CHECK-LABEL: fneg_idiomv32f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = fsub <32 x half> <half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0, half -0.0>, %x + ret <32 x half> %a +} + +define <32 x half> @fabsv32f16(<32 x half> %x) { +; CHECK-LABEL: fabsv32f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x half> @llvm.fabs.v32f16(<32 x half> %x) + ret <32 x half> %a +} +declare <32 x half> @llvm.fabs.v32f16(<32 x half>) + +define <32 x half> @fcopysignv32f16(<32 x half> %x, <32 x half> %y) { +; CHECK-LABEL: fcopysignv32f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: retq + %a = call <32 x
half> @llvm.copysign.v32f16(<32 x half> %x, <32 x half> %y) + ret <32 x half> %a +} +declare <32 x half> @llvm.copysign.v32f16(<32 x half>, <32 x half>) + define <8 x half> @regression_test1(<8 x half> %x, <8 x half> %y) #0 { ; CHECK-LABEL: regression_test1: ; CHECK: ## %bb.0: ## %entry diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll index cb31baf9a82eaa..170e1ea1a6a927 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -13,6 +13,472 @@ define i32 @test_x86_avx512fp16_ucomi_sh_lt(<8 x half> %a0, <8 x half> %a1) { ret i32 %res } +declare <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half>, i32) nounwind readnone + +define <32 x half> @test_sqrt_ph_512(<32 x half> %a0) { +; CHECK-LABEL: test_sqrt_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtph %zmm0, %zmm0 +; CHECK-NEXT: retq + %1 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + ret <32 x half> %1 +} + +define <32 x half> @test_mask_sqrt_ph_512(<32 x half> %a0, <32 x half> %passthru, i32 %mask) { +; CHECK-LABEL: test_mask_sqrt_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtph %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %1 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> %passthru + ret <32 x half> %3 +} + +define <32 x half> @test_maskz_sqrt_ph_512(<32 x half> %a0, i32 %mask) { +; CHECK-LABEL: test_maskz_sqrt_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtph %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> zeroinitializer + ret <32 x half> %3 +} + +declare <32 x half> @llvm.sqrt.v32f16(<32 x half>) + +define <32 x half> @test_sqrt_round_ph_512(<32 x half> %a0) { +; CHECK-LABEL: test_sqrt_round_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtph {rz-sae}, %zmm0, %zmm0 +; CHECK-NEXT: retq + %1 = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %a0, i32 11) + ret <32 x half> %1 +} + +define <32 x half> @test_mask_sqrt_round_ph_512(<32 x half> %a0, <32 x half> %passthru, i32 %mask) { +; CHECK-LABEL: test_mask_sqrt_round_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtph {rz-sae}, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %1 = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %a0, i32 11) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> %passthru + ret <32 x half> %3 +} + +define <32 x half> @test_maskz_sqrt_round_ph_512(<32 x half> %a0, i32 %mask) { +; CHECK-LABEL: test_maskz_sqrt_round_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtph {rz-sae}, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %a0, i32 11) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x half> %1, <32 x half> zeroinitializer + ret <32 x half> %3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32) nounwind readnone + +define <8 x half> @test_sqrt_sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) { +; CHECK-LABEL: test_sqrt_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtsh 
%xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask, i32 4) + ret <8 x half> %res +} + +define <8 x half> @test_sqrt_sh_r(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) { +; CHECK-LABEL: test_sqrt_sh_r: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtsh {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask, i32 10) + ret <8 x half> %res +} + +define <8 x half> @test_sqrt_sh_nomask(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { +; CHECK-LABEL: test_sqrt_sh_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 -1, i32 4) + ret <8 x half> %res +} + +define <8 x half> @test_sqrt_sh_z(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, i8 %mask) { +; CHECK-LABEL: test_sqrt_sh_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtsh {ru-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 10) + ret <8 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half>, <32 x half>, i32) +declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8) + +define <32 x half> @test_rsqrt_ph_512(<32 x half> %a0) { +; CHECK-LABEL: test_rsqrt_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtph %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 -1) + ret <32 x half> %res +} + +define <8 x half> @test_rsqrt_sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) { +; CHECK-LABEL: test_rsqrt_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtsh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a0, <8 x half> %a2, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_rsqrt_sh_load(<8 x half> %a0, <8 x half>* %a1ptr) { +; CHECK-LABEL: test_rsqrt_sh_load: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a1 = load <8 x half>, <8 x half>* %a1ptr + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_rsqrt_sh_maskz(<8 x half> %a0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_sh_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrsqrtsh %xmm0, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a0, <8 x half> zeroinitializer, i8 %mask) + ret <8 x half> %res +} + +define <8 x half> @test_rsqrt_sh_mask(<8 x half> %a0, <8 x half> %b0, <8 x half> %c0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_sh_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrsqrtsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %b0, <8 x half> %c0, i8 %mask) + ret <8 x half> %res +} + +declare <32 x i1> 
@llvm.x86.avx512fp16.fpclass.ph.512(<32 x half>, i32) + +define i32 @test_int_x86_avx512_fpclass_ph_512(<32 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_fpclass_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclassph $2, %zmm0, %k1 +; CHECK-NEXT: vfpclassph $4, %zmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %x0, i32 4) + %res1 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %x0, i32 2) + %1 = and <32 x i1> %res1, %res + %2 = bitcast <32 x i1> %1 to i32 + ret i32 %2 +} + +declare i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half>, i32, i8) + +define i8 @test_int_x86_avx512_mask_fpclass_sh(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclasssh $4, %xmm0, %k1 +; CHECK-NEXT: vfpclasssh $2, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq + %res = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %x0, i32 2, i8 -1) + %res1 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %x0, i32 4, i8 %res) + ret i8 %res1 +} + +define i8 @test_int_x86_avx512_mask_fpclass_sh_load(<8 x half>* %x0ptr) { +; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sh_load: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclasssh $4, (%rdi), %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq + %x0 = load <8 x half>, <8 x half>* %x0ptr + %res = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %x0, i32 4, i8 -1) + ret i8 %res +} + +declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_rcp_ph_512(<32 x half> %a0, <32 x half> %a1, i32 %mask) { +; CHECK-LABEL: test_rcp_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrcpph %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 %mask) + ret <32 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half>, <8 x half>, <8 x half>, i8) + +define <8 x half> @test_rcp_sh(<8 x half> %a0) { +; CHECK-LABEL: test_rcp_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: vrcpsh %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a0, <8 x half> zeroinitializer, i8 -1) + ret <8 x half> %res +} + +define <8 x half> @test_rcp_sh_load(<8 x half> %a0, <8 x half>* %a1ptr) { +; CHECK-LABEL: test_rcp_sh_load: +; CHECK: # %bb.0: +; CHECK-NEXT: vrcpsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a1 = load <8 x half>, <8 x half>* %a1ptr + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 -1) + ret <8 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half>, i32, <32 x half>, i32, i32) + +define <32 x half>@test_int_x86_avx512_mask_reduce_ph_512(<32 x half> %x0, <32 x half> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreduceph $8, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vreduceph $4, {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %x0, i32 8, <32 x half> %x2, i32 %x3, i32 4) + %res1 = call <32 
x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %x0, i32 4, <32 x half> %x2, i32 -1, i32 8) + %res2 = fadd <32 x half> %res, %res1 + ret <32 x half> %res2 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half>, <8 x half>,<8 x half>, i8, i32, i32) + +define <8 x half>@test_int_x86_avx512_mask_reduce_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreducesh $4, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4, i32 4, i32 4) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_reduce_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sh_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vreducesh $4, {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 -1, i32 4, i32 8) + ret <8 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, <32 x half>, i32, i32) + +define <32 x half>@test_int_x86_avx512_mask_rndscale_ph_512(<32 x half> %x0, <32 x half> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrndscaleph $8, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vrndscaleph $4, {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %x0, i32 8, <32 x half> %x2, i32 %x3, i32 4) + %res1 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %x0, i32 4, <32 x half> %x2, i32 -1, i32 8) + %res2 = fadd <32 x half> %res, %res1 + ret <32 x half> %res2 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half>, <8 x half>,<8 x half>, i8, i32, i32) + +define <8 x half>@test_int_x86_avx512_mask_rndscale_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrndscalesh $4, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4, i32 4, i32 4) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_rndscale_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_sh_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscalesh $4, {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 -1, i32 4, i32 8) + ret <8 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half>, <32 x half>, i32, i32) + +define <32 x half>@test_int_x86_avx512_mask_getexp_ph_512(<32 x half> %x0, <32 x half> %x1, i32 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_getexp_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetexpph %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vgetexpph {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res1 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> 
%x0, <32 x half> %x1, i32 %x2, i32 4) + %res2 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %x0, <32 x half> zeroinitializer, i32 -1, i32 8) + %res3 = fadd <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half>, <8 x half>,<8 x half>, i8, i32) + +define <8 x half>@test_int_x86_avx512_mask_getexp_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_getexp_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetexpsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4, i32 4) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_getexp_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_getexp_sh_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vgetexpsh {sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 -1, i32 8) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_getexp_sh_load(<8 x half> %x0, <8 x half>* %x1ptr) { +; CHECK-LABEL: test_int_x86_avx512_mask_getexp_sh_load: +; CHECK: # %bb.0: +; CHECK-NEXT: vgetexpsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x1 = load <8 x half>, <8 x half>* %x1ptr + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> undef, i8 -1, i32 4) + ret <8 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half>, i32, <32 x half>, i32, i32) + +define <32 x half>@test_int_x86_avx512_mask_getmant_ph_512(<32 x half> %x0, <32 x half> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetmantph $8, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vgetmantph $4, {sae}, %zmm0, %zmm0 +; CHECK-NEXT: vaddph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %x0, i32 8, <32 x half> %x2, i32 %x3, i32 4) + %res1 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %x0, i32 4, <32 x half> %x2, i32 -1, i32 8) + %res2 = fadd <32 x half> %res, %res1 + ret <32 x half> %res2 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32) + +define <8 x half>@test_int_x86_avx512_mask_getmant_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %x0, <8 x half> %x1, i32 11, <8 x half> %x3, i8 %x4, i32 4) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_getmant_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sh_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %x0, <8 x half> %x1, i32 11, <8 x half> %x3, i8 -1, i32 4) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_getmant_sh_z(<8 x half> %x0, <8 x half> 
%x1, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sh_z: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetmantsh $11, %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %x0, <8 x half> %x1, i32 11, <8 x half> zeroinitializer, i8 %x4, i32 4) + ret <8 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half>, <32 x half>, <32 x half>, i32, i32) + +define <32 x half>@test_int_x86_avx512_mask_scalef_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vscalefph {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vscalefph {rn-sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddph %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %x3 to <32 x i1> + %res1 = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3, i32 11) + %res2 = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> %x0, <32 x half> %x1, <32 x half> zeroinitializer, i32 -1, i32 8) + %res3 = fadd <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half>, <8 x half>,<8 x half>, i8, i32) + +define <8 x half>@test_int_x86_avx512_mask_scalef_sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vscalefsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 %x4, i32 4) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_scalef_sh_nomask(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sh_nomask: +; CHECK: # %bb.0: +; CHECK-NEXT: vscalefsh {rn-sae}, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> %x3, i8 -1, i32 8) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_scalef_sh_load(<8 x half> %x0, <8 x half>* %x1ptr) { +; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sh_load: +; CHECK: # %bb.0: +; CHECK-NEXT: vscalefsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x1 = load <8 x half>, <8 x half>* %x1ptr + %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %x0, <8 x half> %x1, <8 x half> undef, i8 -1, i32 4) + ret <8 x half> %res +} + declare <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { diff --git a/llvm/test/CodeGen/X86/avx512fp16-rndscale.ll b/llvm/test/CodeGen/X86/avx512fp16-rndscale.ll new file mode 100644 index 00000000000000..c958b7e86d9f18 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-rndscale.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512fp16 | FileCheck %s + +declare <8 x half> @llvm.ceil.v8f16(<8 x half>) +declare <16 x half> @llvm.ceil.v16f16(<16 x half>) +declare <32 x half> @llvm.ceil.v32f16(<32 x half>) + +define <8 x half> @ceil_v8f16(<8 x half> 
%p) { +; CHECK-LABEL: ceil_v8f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $10, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t = call <8 x half> @llvm.ceil.v8f16(<8 x half> %p) + ret <8 x half> %t +} + +define <16 x half> @ceil_v16f16(<16 x half> %p) { +; CHECK-LABEL: ceil_v16f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $10, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t = call <16 x half> @llvm.ceil.v16f16(<16 x half> %p) + ret <16 x half> %t +} + +define <32 x half> @ceil_v32f16(<32 x half> %p) { +; CHECK-LABEL: ceil_v32f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $10, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t = call <32 x half> @llvm.ceil.v32f16(<32 x half> %p) + ret <32 x half> %t +} + +declare <8 x half> @llvm.floor.v8f16(<8 x half>) +declare <16 x half> @llvm.floor.v16f16(<16 x half>) +declare <32 x half> @llvm.floor.v32f16(<32 x half>) + +define <8 x half> @floor_v8f16(<8 x half> %p) { +; CHECK-LABEL: floor_v8f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $9, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t = call <8 x half> @llvm.floor.v8f16(<8 x half> %p) + ret <8 x half> %t +} + +define <16 x half> @floor_v16f16(<16 x half> %p) { +; CHECK-LABEL: floor_v16f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $9, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t = call <16 x half> @llvm.floor.v16f16(<16 x half> %p) + ret <16 x half> %t +} + +define <32 x half> @floor_v32f16(<32 x half> %p) { +; CHECK-LABEL: floor_v32f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $9, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t = call <32 x half> @llvm.floor.v32f16(<32 x half> %p) + ret <32 x half> %t +} + +declare <8 x half> @llvm.trunc.v8f16(<8 x half>) +declare <16 x half> @llvm.trunc.v16f16(<16 x half>) +declare <32 x half> @llvm.trunc.v32f16(<32 x half>) + +define <8 x half> @trunc_v8f16(<8 x half> %p) { +; CHECK-LABEL: trunc_v8f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $11, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t = call <8 x half> @llvm.trunc.v8f16(<8 x half> %p) + ret <8 x half> %t +} + +define <16 x half> @trunc_v16f16(<16 x half> %p) { +; CHECK-LABEL: trunc_v16f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $11, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t = call <16 x half> @llvm.trunc.v16f16(<16 x half> %p) + ret <16 x half> %t +} + +define <32 x half> @trunc_v32f16(<32 x half> %p) { +; CHECK-LABEL: trunc_v32f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $11, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t = call <32 x half> @llvm.trunc.v32f16(<32 x half> %p) + ret <32 x half> %t +} + +declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>) +declare <16 x half> @llvm.nearbyint.v16f16(<16 x half>) +declare <32 x half> @llvm.nearbyint.v32f16(<32 x half>) + +define <8 x half> @nearbyint_v8f16(<8 x half> %p) { +; CHECK-LABEL: nearbyint_v8f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $12, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %p) + ret <8 x half> %t +} + +define <16 x half> @nearbyint_v16f16(<16 x half> %p) { +; CHECK-LABEL: nearbyint_v16f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $12, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %p) + ret <16 x half> %t +} + +define <32 x half> @nearbyint_v32f16(<32 x half> %p) { +; CHECK-LABEL: nearbyint_v32f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $12, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t = call <32 x half> @llvm.nearbyint.v32f16(<32 x half> %p) + ret <32 x half> %t +} + +declare <8 x half> @llvm.rint.v8f16(<8 x half>) +declare <16 x half> @llvm.rint.v16f16(<16 x 
half>) +declare <32 x half> @llvm.rint.v32f16(<32 x half>) + +define <8 x half> @rint_v8f16(<8 x half> %p) { +; CHECK-LABEL: rint_v8f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $4, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t = call <8 x half> @llvm.rint.v8f16(<8 x half> %p) + ret <8 x half> %t +} + +define <16 x half> @rint_v16f16(<16 x half> %p) { +; CHECK-LABEL: rint_v16f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $4, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t = call <16 x half> @llvm.rint.v16f16(<16 x half> %p) + ret <16 x half> %t +} + +define <32 x half> @rint_v32f16(<32 x half> %p) { +; CHECK-LABEL: rint_v32f16: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscaleph $4, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t = call <32 x half> @llvm.rint.v32f16(<32 x half> %p) + ret <32 x half> %t +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-scalar.ll b/llvm/test/CodeGen/X86/avx512fp16-scalar.ll new file mode 100644 index 00000000000000..36145e86469aab --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-scalar.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512fp16 --show-mc-encoding | FileCheck %s + +declare half @llvm.ceil.f16(half) +declare half @llvm.floor.f16(half) +declare half @llvm.trunc.f16(half) +declare half @llvm.rint.f16(half) +declare half @llvm.nearbyint.f16(half) + +define half @test_ceil(half %a) { +; CHECK-LABEL: test_ceil: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscalesh $10, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x0a] +; CHECK-NEXT: retq ## encoding: [0xc3] + %c = call half @llvm.ceil.f16(half %a) + ret half %c +} + +define half @test_floor(half %a) { +; CHECK-LABEL: test_floor: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscalesh $9, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x09] +; CHECK-NEXT: retq ## encoding: [0xc3] + %c = call half @llvm.floor.f16(half %a) + ret half %c +} + +define half @test_trunc(half %a) { +; CHECK-LABEL: test_trunc: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscalesh $11, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x0b] +; CHECK-NEXT: retq ## encoding: [0xc3] + %c = call half @llvm.trunc.f16(half %a) + ret half %c +} + +define half @test_rint(half %a) { +; CHECK-LABEL: test_rint: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscalesh $4, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x04] +; CHECK-NEXT: retq ## encoding: [0xc3] + %c = call half @llvm.rint.f16(half %a) + ret half %c +} + +define half @test_nearbyint(half %a) { +; CHECK-LABEL: test_nearbyint: +; CHECK: ## %bb.0: +; CHECK-NEXT: vrndscalesh $12, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7c,0x08,0x0a,0xc0,0x0c] +; CHECK-NEXT: retq ## encoding: [0xc3] + %c = call half @llvm.nearbyint.f16(half %a) + ret half %c +} diff --git a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll index 6bfd69ef5db97b..93efbace3e759d 100644 --- a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll @@ -945,3 +945,377 @@ define <4 x i64> @test_int_x86_avx512_maskz_cvtt_ph2uqq_256(<8 x half> %x0, i8 % %res = call <4 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.256(<8 x half> %x0, <4 x i64> zeroinitializer, i8 %x2) ret <4 x i64> %res } + +declare <8 x half> @llvm.sqrt.v8f16(<8 x half>) +declare <16 x half> @llvm.sqrt.v16f16(<16 x half>) + +define <8 x half> @test_sqrt_ph_128(<8 x half> %a0) { +; CHECK-LABEL: test_sqrt_ph_128: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsqrtph %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0) + ret <8 x half> %1 +} + +define <8 x half> @test_mask_sqrt_ph_128(<8 x half> %a0, <8 x half> %passthru, i8 %mask) { +; CHECK-LABEL: test_mask_sqrt_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %1 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x half> %1, <8 x half> %passthru + ret <8 x half> %3 +} + +define <8 x half> @test_maskz_sqrt_ph_128(<8 x half> %a0, i8 %mask) { +; CHECK-LABEL: test_maskz_sqrt_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtph %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x half> %1, <8 x half> zeroinitializer + ret <8 x half> %3 +} + +define <16 x half> @test_sqrt_ph_256(<16 x half> %a0) { +; CHECK-LABEL: test_sqrt_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtph %ymm0, %ymm0 +; CHECK-NEXT: retq + %1 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0) + ret <16 x half> %1 +} + +define <16 x half> @test_mask_sqrt_ph_256(<16 x half> %a0, <16 x half> %passthru, i16 %mask) { +; CHECK-LABEL: test_mask_sqrt_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtph %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %1 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x half> %1, <16 x half> %passthru + ret <16 x half> %3 +} + +define <16 x half> @test_maskz_sqrt_ph_256(<16 x half> %a0, i16 %mask) { +; CHECK-LABEL: test_maskz_sqrt_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsqrtph %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %1 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x half> %1, <16 x half> zeroinitializer + ret <16 x half> %3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half>, <8 x half>, i8) +declare <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half>, <16 x half>, i16) + +define <8 x half> @test_rsqrt_ph_128(<8 x half> %a0) { +; CHECK-LABEL: test_rsqrt_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 -1) + ret <8 x half> %res +} + +define <16 x half> @test_rsqrt_ph_256(<16 x half> %a0) { +; CHECK-LABEL: test_rsqrt_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrtph %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 -1) + ret <16 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half>, <8 x half>, i8) +declare <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half>, <16 x half>, i16) + +define <8 x half> @test_rcp_ph_128(<8 x half> %a0, <8 x half> %a1, i8 %mask) { +; CHECK-LABEL: test_rcp_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrcpph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> %a1, i8 %mask) + ret <8 x half> %res +} + +define <16 x half> 
@test_rcp_ph_256(<16 x half> %a0, <16 x half> %a1, i16 %mask) { +; CHECK-LABEL: test_rcp_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrcpph %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> %a1, i16 %mask) + ret <16 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half>, i32, <8 x half>, i8) +declare <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half>, i32, <16 x half>, i16) + +define <8 x half>@test_int_x86_avx512_mask_reduce_ph_128(<8 x half> %x0, <8 x half> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreduceph $8, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vreduceph $4, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %x0, i32 8, <8 x half> %x2, i8 %x3) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %x0, i32 4, <8 x half> %x2, i8 -1) + %res2 = fadd <8 x half> %res, %res1 + ret <8 x half> %res2 +} + +define <16 x half>@test_int_x86_avx512_mask_reduce_ph_256(<16 x half> %x0, <16 x half> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vreduceph $8, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vreduceph $4, %ymm0, %ymm0 +; CHECK-NEXT: vaddph %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %x0, i32 8, <16 x half> %x2, i16 %x3) + %res1 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %x0, i32 4, <16 x half> %x2, i16 -1) + %res2 = fadd <16 x half> %res, %res1 + ret <16 x half> %res2 +} + +declare <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half>, i32) +declare <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half>, i32) + +define i8 @test_int_x86_avx512_fpclass_ph_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_fpclass_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclassph $2, %xmm0, %k1 +; CHECK-NEXT: vfpclassph $4, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq + %res = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %x0, i32 4) + %res1 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %x0, i32 2) + %1 = and <8 x i1> %res1, %res + %2 = bitcast <8 x i1> %1 to i8 + ret i8 %2 +} + +define i16 @test_int_x86_avx512_fpclass_ph_256(<16 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_fpclass_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vfpclassph $2, %ymm0, %k1 +; CHECK-NEXT: vfpclassph $4, %ymm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %res = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %x0, i32 4) + %res1 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %x0, i32 2) + %1 = and <16 x i1> %res1, %res + %2 = bitcast <16 x i1> %1 to i16 + ret i16 %2 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half>, <8 x half>, i8) +declare <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half>, <16 x half>, i16) + +define <8 x half>@test_int_x86_avx512_getexp_ph_128(<8 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_getexp_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vgetexpph %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %x0, <8 x half> zeroinitializer, i8 -1) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_getexp_ph_128(<8 x half> %x0, <8 x half> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_getexp_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetexpph %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %x0, <8 x half> %x1, i8 %x2) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_maskz_getexp_ph_128(<8 x half> %x0, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_getexp_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetexpph %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %x0, <8 x half> zeroinitializer, i8 %x2) + ret <8 x half> %res +} + +define <16 x half>@test_int_x86_avx512_getexp_ph_256(<16 x half> %x0) { +; CHECK-LABEL: test_int_x86_avx512_getexp_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vgetexpph %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %x0, <16 x half> zeroinitializer, i16 -1) + ret <16 x half> %res +} + +define <16 x half>@test_int_x86_avx512_mask_getexp_ph_256(<16 x half> %x0, <16 x half> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_getexp_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetexpph %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %x0, <16 x half> %x1, i16 %x2) + ret <16 x half> %res +} + +define <16 x half>@test_int_x86_avx512_maskz_getexp_ph_256(<16 x half> %x0, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_maskz_getexp_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetexpph %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %x0, <16 x half> zeroinitializer, i16 %x2) + ret <16 x half> %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half>, i32, <8 x half>, i8) +declare <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half>, i32, <16 x half>, i16) + +define <8 x half>@test_int_x86_avx512_mask_getmant_ph_128(<8 x half> %x0, <8 x half> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetmantph $8, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vgetmantph $4, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %x0, i32 8, <8 x half> %x2, i8 %x3) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %x0, i32 4, <8 x half> %x2, i8 -1) + %res2 = fadd <8 x half> %res, %res1 + ret <8 x half> %res2 +} + +define <16 x half>@test_int_x86_avx512_mask_getmant_ph_256(<16 x half> %x0, <16 x half> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vgetmantph $8, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vgetmantph $4, %ymm0, %ymm0 +; CHECK-NEXT: vaddph %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %x0, i32 8, <16 x half> 
%x2, i16 %x3) + %res1 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %x0, i32 4, <16 x half> %x2, i16 -1) + %res2 = fadd <16 x half> %res, %res1 + ret <16 x half> %res2 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half>, i32, <8 x half>, i8) +declare <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half>, i32, <16 x half>, i16) + +define <8 x half>@test_int_x86_avx512_mask_rndscale_ph_128(<8 x half> %x0, <8 x half> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrndscaleph $8, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vrndscaleph $4, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %x0, i32 8, <8 x half> %x2, i8 %x3) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %x0, i32 4, <8 x half> %x2, i8 -1) + %res2 = fadd <8 x half> %res, %res1 + ret <8 x half> %res2 +} + +define <16 x half>@test_int_x86_avx512_mask_rndscale_ph_256(<16 x half> %x0, <16 x half> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vrndscaleph $8, %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vrndscaleph $4, %ymm0, %ymm0 +; CHECK-NEXT: vaddph %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %x0, i32 8, <16 x half> %x2, i16 %x3) + %res1 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %x0, i32 4, <16 x half> %x2, i16 -1) + %res2 = fadd <16 x half> %res, %res1 + ret <16 x half> %res2 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.scalef.ph.128(<8 x half>, <8 x half>, <8 x half>, i8) +declare <16 x half> @llvm.x86.avx512fp16.mask.scalef.ph.256(<16 x half>, <16 x half>, <16 x half>, i16) + +define <8 x half>@test_int_x86_avx512_scalef_ph_128(<8 x half> %x0, <8 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_scalef_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vscalefph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> zeroinitializer, i8 -1) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_mask_scalef_ph_128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vscalefph %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mask = bitcast i8 %x3 to <8 x i1> + %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, i8 %x3) + ret <8 x half> %res +} + +define <8 x half>@test_int_x86_avx512_maskz_scalef_ph_128(<8 x half> %x0, <8 x half> %x1, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_scalef_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vscalefph %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = bitcast i8 %x3 to <8 x i1> + %res = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.ph.128(<8 x half> %x0, <8 x half> %x1, <8 x half> zeroinitializer, i8 %x3) + ret <8 x half> %res +} + +define <16 x half>@test_int_x86_avx512_scalef_ph_256(<16 x half> %x0, <16 x half> %x1) { +; CHECK-LABEL: test_int_x86_avx512_scalef_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vscalefph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <16 
x half> @llvm.x86.avx512fp16.mask.scalef.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> zeroinitializer, i16 -1) + ret <16 x half> %res +} + +define <16 x half>@test_int_x86_avx512_mask_scalef_ph_256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vscalefph %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %mask = bitcast i16 %x3 to <16 x i1> + %res = call <16 x half> @llvm.x86.avx512fp16.mask.scalef.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, i16 %x3) + ret <16 x half> %res +} + +define <16 x half>@test_int_x86_avx512_maskz_scalef_ph_256(<16 x half> %x0, <16 x half> %x1, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_scalef_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vscalefph %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = bitcast i16 %x3 to <16 x i1> + %res = call <16 x half> @llvm.x86.avx512fp16.mask.scalef.ph.256(<16 x half> %x0, <16 x half> %x1, <16 x half> zeroinitializer, i16 %x3) + ret <16 x half> %res +} diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll index a2c289f1a26e4b..e7e52f153bc35f 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll @@ -10,6 +10,7 @@ declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) declare double @llvm.experimental.constrained.fpext.f64.f16(half, metadata) declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata) +declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata) define half @fadd_f16(half %a, half %b) nounwind strictfp { ; X86-LABEL: fadd_f16: @@ -173,4 +174,27 @@ define void @fptrunc_double_to_f16(double* %val, half *%ret) nounwind strictfp { ret void } +define void @fsqrt_f16(half* %a) nounwind strictfp { +; X86-LABEL: fsqrt_f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: vsqrtsh %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: fsqrt_f16: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: vsqrtsh %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq + %1 = load half, half* %a, align 4 + %res = call half @llvm.experimental.constrained.sqrt.f16(half %1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + store half %res, half* %a, align 4 + ret void +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll new file mode 100644 index 00000000000000..5832301aeb4e5e --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X64 + +declare half @llvm.experimental.constrained.ceil.f16(half, metadata) +declare half @llvm.experimental.constrained.floor.f16(half, metadata) +declare half @llvm.experimental.constrained.trunc.f16(half, metadata) +declare half 
@llvm.experimental.constrained.rint.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.nearbyint.f16(half, metadata, metadata) + +define half @fceil32(half %f) #0 { +; X86-LABEL: fceil32: +; X86: # %bb.0: +; X86-NEXT: vrndscalesh $10, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fceil32: +; X64: # %bb.0: +; X64-NEXT: vrndscalesh $10, %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq + %res = call half @llvm.experimental.constrained.ceil.f16( + half %f, metadata !"fpexcept.strict") #0 + ret half %res +} + +define half @ffloor32(half %f) #0 { +; X86-LABEL: ffloor32: +; X86: # %bb.0: +; X86-NEXT: vrndscalesh $9, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: ffloor32: +; X64: # %bb.0: +; X64-NEXT: vrndscalesh $9, %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq + %res = call half @llvm.experimental.constrained.floor.f16( + half %f, metadata !"fpexcept.strict") #0 + ret half %res +} + +define half @ftrunc32(half %f) #0 { +; X86-LABEL: ftrunc32: +; X86: # %bb.0: +; X86-NEXT: vrndscalesh $11, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: ftrunc32: +; X64: # %bb.0: +; X64-NEXT: vrndscalesh $11, %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq + %res = call half @llvm.experimental.constrained.trunc.f16( + half %f, metadata !"fpexcept.strict") #0 + ret half %res +} + +define half @frint32(half %f) #0 { +; X86-LABEL: frint32: +; X86: # %bb.0: +; X86-NEXT: vrndscalesh $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: frint32: +; X64: # %bb.0: +; X64-NEXT: vrndscalesh $4, %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq + %res = call half @llvm.experimental.constrained.rint.f16( + half %f, + metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret half %res +} + +define half @fnearbyint32(half %f) #0 { +; X86-LABEL: fnearbyint32: +; X86: # %bb.0: +; X86-NEXT: vrndscalesh $12, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fnearbyint32: +; X64: # %bb.0: +; X64-NEXT: vrndscalesh $12, %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq + %res = call half @llvm.experimental.constrained.nearbyint.f16( + half %f, + metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret half %res +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll index 222abba7d2f7c9..ef84bf32619e0f 100644 --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll @@ -124,6 +124,153 @@ define <8 x half> @stack_fold_divsh_int(<8 x half> %a0, <8 x half> %a1) { ret <8 x half> %5 } +define i32 @stack_fold_fpclassph(<32 x half> %a0) { + ;CHECK-LABEL: stack_fold_fpclassph: + ;CHECK: vfpclassphz $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4) + %3 = bitcast <32 x i1> %2 to i32 + ret i32 %3 +} +declare <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half>, i32) + +define i32 @stack_fold_fpclassph_mask(<32 x half> %a0, <32 x i1>* %p) { + ;CHECK-LABEL: stack_fold_fpclassph_mask: + ;CHECK: vfpclassphz $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded 
Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4) + %mask = load <32 x i1>, <32 x i1>* %p + %3 = and <32 x i1> %2, %mask + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define i8 @stack_fold_fpclasssh(<8 x half> %a0) { + ;CHECK-LABEL: stack_fold_fpclasssh: + ;CHECK: vfpclasssh $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 -1) + ret i8 %2 +} +declare i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half>, i32, i8) + +define i8 @stack_fold_fpclasssh_mask(<8 x half> %a0, i8* %p) { + ;CHECK-LABEL: stack_fold_fpclasssh_mask: + ;CHECK: vfpclasssh $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %mask = load i8, i8* %p + %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 %mask) + ret i8 %2 +} + +define <32 x half> @stack_fold_getexpph(<32 x half> %a0) { + ;CHECK-LABEL: stack_fold_getexpph: + ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1, i32 4) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half>, <32 x half>, i32, i32) + +define <32 x half> @stack_fold_getexpph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) { + ;CHECK-LABEL: stack_fold_getexpph_mask: + ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <32 x half>, <32 x half>* %passthru + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_getexpph_maskz(<32 x half> %a0, i32* %mask) { + ;CHECK-LABEL: stack_fold_getexpph_maskz: 
+ ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i32, i32* %mask + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2, i32 4) + ret <32 x half> %3 +} + +define <8 x half> @stack_fold_getexpsh(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_getexpsh: + ;CHECK: vgetexpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @stack_fold_getexpsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_getexpsh_mask: + ;CHECK: vgetexpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_getexpsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_getexpsh_maskz: + ;CHECK: vgetexpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4) + ret <8 x half> %3 +} + +define <32 x half> @stack_fold_getmantph(<32 x half> %a0) { + ;CHECK-LABEL: stack_fold_getmantph: + ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> 
@llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half>, i32, <32 x half>, i32, i32) + +define <32 x half> @stack_fold_getmantph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) { + ;CHECK-LABEL: stack_fold_getmantph_mask: + ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <32 x half>, <32 x half>* %passthru + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_getmantph_maskz(<32 x half> %a0, i32* %mask) { + ;CHECK-LABEL: stack_fold_getmantph_maskz: + ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i32, i32* %mask + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4) + ret <32 x half> %3 +} + +define <8 x half> @stack_fold_getmantsh(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_getmantsh: + ;CHECK: vgetmantsh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> undef, i8 -1, i32 4) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32) + +define <8 x half> @stack_fold_getmantsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_getmantsh_mask: + ;CHECK: vgetmantsh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> %2, i8 %mask, i32 4) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_getmantsh_maskz: + ;CHECK: 
vgetmantsh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> zeroinitializer, i8 %2, i32 4) + ret <8 x half> %3 +} + define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 { ;CHECK-LABEL: stack_fold_maxph_zmm: ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload @@ -541,6 +688,280 @@ define <8 x half> @stack_fold_mulsh_int(<8 x half> %a0, <8 x half> %a1) { ret <8 x half> %5 } +define <32 x half> @stack_fold_rcpph(<32 x half> %a0) { + ;CHECK-LABEL: stack_fold_rcpph: + ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @stack_fold_rcpph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) { + ;CHECK-LABEL: stack_fold_rcpph_mask: + ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <32 x half>, <32 x half>* %passthru + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_rcpph_maskz(<32 x half> %a0, i32* %mask) { + ;CHECK-LABEL: stack_fold_rcpph_maskz: + ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i32, i32* %mask + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2) + ret <32 x half> %3 +} + +define <8 x half> @stack_fold_rcpsh(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_rcpsh: + ;CHECK: vrcpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half>, <8 x half>, <8 x half>, i8) + +define <8 x half> @stack_fold_rcpsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_rcpsh_mask: + ;CHECK: vrcpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_rcpsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_rcpsh_maskz: + ;CHECK: vrcpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2) + ret <8 x half> %3 +} + +define <32 x half> @stack_fold_reduceph(<32 x half> %a0) { + ;CHECK-LABEL: stack_fold_reduceph: + ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half>, i32, <32 x half>, i32, i32) + +define <32 x half> @stack_fold_reduceph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) { + ;CHECK-LABEL: stack_fold_reduceph_mask: + ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <32 x half>, <32 x half>* %passthru + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x 
half> %2, i32 %mask, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_reduceph_maskz(<32 x half> %a0, i32* %mask) { + ;CHECK-LABEL: stack_fold_reduceph_maskz: + ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i32, i32* %mask + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4) + ret <32 x half> %3 +} + +define <8 x half> @stack_fold_reducesh(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_reducesh: + ;CHECK: vreducesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32) + +define <8 x half> @stack_fold_reducesh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_reducesh_mask: + ;CHECK: vreducesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_reducesh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_reducesh_maskz: + ;CHECK: vreducesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4) + ret <8 x half> %3 +} + +define <32 x half> @stack_fold_rndscaleph(<32 x half> %a0) { + ;CHECK-LABEL: stack_fold_rndscaleph: + ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, <32 x half>, i32, i32) + +define <32 x half> @stack_fold_rndscaleph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) { + ;CHECK-LABEL: stack_fold_rndscaleph_mask: + ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <32 x half>, <32 x half>* %passthru + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_rndscaleph_maskz(<32 x half> %a0, i32* %mask) { + ;CHECK-LABEL: stack_fold_rndscaleph_maskz: + ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i32, i32* %mask + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4) + ret <32 x half> %3 +} + +define <8 x half> @stack_fold_rndscalesh(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_rndscalesh: + ;CHECK: vrndscalesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32) + +define <8 x half> @stack_fold_rndscalesh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_rndscalesh_mask: + ;CHECK: vrndscalesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x 
half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_rndscalesh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_rndscalesh_maskz: + ;CHECK: vrndscalesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4) + ret <8 x half> %3 +} + +define <32 x half> @stack_fold_rsqrtph(<32 x half> %a0) { + ;CHECK-LABEL: stack_fold_rsqrtph: + ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @stack_fold_rsqrtph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) { + ;CHECK-LABEL: stack_fold_rsqrtph_mask: + ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <32 x half>, <32 x half>* %passthru + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask) + ret <32 x half> %3 +} + +define <32 x half> @stack_fold_rsqrtph_maskz(<32 x half> %a0, i32* %mask) { + ;CHECK-LABEL: stack_fold_rsqrtph_maskz: + ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i32, i32* %mask + %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2) + ret <32 x half> %3 +} + +define <8 x half> @stack_fold_rsqrtsh(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_rsqrtsh: + ;CHECK: vrsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8) + +define <8 x half> @stack_fold_rsqrtsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_rsqrtsh_mask: + ;CHECK: vrsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_rsqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_rsqrtsh_maskz: + ;CHECK: vrsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2) + ret <8 x half> %3 +} + +define <32 x half> @stack_fold_sqrtph(<32 x half> %a0) { + ;CHECK-LABEL: stack_fold_sqrtph: + ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + ret <32 x half> %2 +} +declare <32 x half> @llvm.sqrt.v32f16(<32 x half>) + +define <32 x half> @stack_fold_sqrtph_mask(<32 x half> %a0, <32 x half>* %passthru, i32 %mask) { + ;CHECK-LABEL: stack_fold_sqrtph_mask: + ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <32 x half>, <32 x half>* %passthru + %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> %2 + ret <32 x half> %5 +} + +define <32 x half> 
@stack_fold_sqrtph_maskz(<32 x half> %a0, i32* %mask) { + ;CHECK-LABEL: stack_fold_sqrtph_maskz: + ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i32, i32* %mask + %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) + %4 = bitcast i32 %2 to <32 x i1> + %5 = select <32 x i1> %4, <32 x half> %3, <32 x half> zeroinitializer + ret <32 x half> %5 +} + +define <8 x half> @stack_fold_sqrtsh(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_sqrtsh: + ;CHECK: vsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @stack_fold_sqrtsh_mask(<8 x half> %a0, <8 x half> %a1, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_sqrtsh_mask: + ;CHECK: vsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_sqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, i8* %mask) { + ;CHECK-LABEL: stack_fold_sqrtsh_maskz: + ;CHECK: vsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4) + ret <8 x half> %3 +} + define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) { ;CHECK-LABEL: stack_fold_subph_zmm ;CHECK: vsubph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll index fab7059dd959dd..92cb57f27b9aba 100644 --- 
a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll @@ -60,6 +60,156 @@ define <16 x half> @stack_fold_divph_ymm(<16 x half> %a0, <16 x half> %a1) { ret <16 x half> %2 } +define i8 @stack_fold_fpclassph(<8 x half> %a0) { + ;CHECK-LABEL: stack_fold_fpclassph: + ;CHECK: fpclassphx $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4) + %3 = bitcast <8 x i1> %2 to i8 + ret i8 %3 +} +declare <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half>, i32) + +define i8 @stack_fold_fpclassph_mask(<8 x half> %a0, <8 x i1>* %p) { + ;CHECK-LABEL: stack_fold_fpclassph_mask: + ;CHECK: fpclassphx $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4) + %mask = load <8 x i1>, <8 x i1>* %p + %3 = and <8 x i1> %2, %mask + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define i16 @stack_fold_fpclassph_ymm(<16 x half> %a0) { + ;CHECK-LABEL: stack_fold_fpclassph_ymm: + ;CHECK: fpclassphy $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4) + %3 = bitcast <16 x i1> %2 to i16 + ret i16 %3 +} +declare <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half>, i32) + +define i16 @stack_fold_fpclassph_mask_ymm(<16 x half> %a0, <16 x i1>* %p) { + ;CHECK-LABEL: stack_fold_fpclassph_mask_ymm: + ;CHECK: fpclassphy $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4) + %mask = load <16 x i1>, <16 x i1>* %p + %3 = and <16 x i1> %2, %mask + %4 = bitcast <16 x i1> %3 to i16 + ret i16 %4 +} + +define <8 x half> @stack_fold_getexpph(<8 x half> %a0) { + ;CHECK-LABEL: stack_fold_getexpph: + ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half>, <8 x half>, i8) + +define <8 x half> @stack_fold_getexpph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_getexpph_mask: + ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_getexpph_maskz(<8 x half> %a0, i8* %mask) { + ;CHECK-LABEL: stack_fold_getexpph_maskz: + ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2) + ret <8 x half> %3 +} + +define <16 x half> @stack_fold_getexpph_ymm(<16 x half> %a0) { + ;CHECK-LABEL: stack_fold_getexpph_ymm: + ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half>, <16 x half>, i16) + +define <16 x half> @stack_fold_getexpph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) { + ;CHECK-LABEL: stack_fold_getexpph_mask_ymm: + ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <16 x half>, <16 x half>* %passthru + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask) + ret <16 x half> %3 +} + +define <16 x half> 
@stack_fold_getexpph_maskz_ymm(<16 x half> %a0, i16* %mask) { + ;CHECK-LABEL: stack_fold_getexpph_maskz_ymm: + ;CHECK: vgetexpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2) + ret <16 x half> %3 +} + +define <8 x half> @stack_fold_getmantph(<8 x half> %a0) { + ;CHECK-LABEL: stack_fold_getmantph: + ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half>, i32, <8 x half>, i8) + +define <8 x half> @stack_fold_getmantph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_getmantph_mask: + ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_getmantph_maskz(<8 x half> %a0, i8* %mask) { + ;CHECK-LABEL: stack_fold_getmantph_maskz: + ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2) + ret <8 x half> %3 +} + +define <16 x half> @stack_fold_getmantph_ymm(<16 x half> %a0) { + ;CHECK-LABEL: stack_fold_getmantph_ymm: + ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x 
half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half>, i32, <16 x half>, i16) + +define <16 x half> @stack_fold_getmantph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) { + ;CHECK-LABEL: stack_fold_getmantph_mask_ymm: + ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <16 x half>, <16 x half>* %passthru + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_getmantph_maskz_ymm(<16 x half> %a0, i16* %mask) { + ;CHECK-LABEL: stack_fold_getmantph_maskz_ymm: + ;CHECK: vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2) + ret <16 x half> %3 +} + define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 { ;CHECK-LABEL: stack_fold_maxph ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload @@ -144,6 +294,284 @@ define <16 x half> @stack_fold_mulph_ymm(<16 x half> %a0, <16 x half> %a1) { ret <16 x half> %2 } +define <8 x half> @stack_fold_rcpph(<8 x half> %a0) { + ;CHECK-LABEL: stack_fold_rcpph: + ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half>, <8 x half>, i8) + +define <8 x half> @stack_fold_rcpph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_rcpph_mask: + ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask) + ret <8 x 
half> %3 +} + +define <8 x half> @stack_fold_rcpph_maskz(<8 x half> %a0, i8* %mask) { + ;CHECK-LABEL: stack_fold_rcpph_maskz: + ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2) + ret <8 x half> %3 +} + +define <16 x half> @stack_fold_rcpph_ymm(<16 x half> %a0) { + ;CHECK-LABEL: stack_fold_rcpph_ymm: + ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half>, <16 x half>, i16) + +define <16 x half> @stack_fold_rcpph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) { + ;CHECK-LABEL: stack_fold_rcpph_mask_ymm: + ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <16 x half>, <16 x half>* %passthru + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_rcpph_maskz_ymm(<16 x half> %a0, i16* %mask) { + ;CHECK-LABEL: stack_fold_rcpph_maskz_ymm: + ;CHECK: vrcpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2) + ret <16 x half> %3 +} + +define <8 x half> @stack_fold_reduceph(<8 x half> %a0) { + ;CHECK-LABEL: stack_fold_reduceph: + ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 
x half> %a0, i32 8, <8 x half> undef, i8 -1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half>, i32, <8 x half>, i8) + +define <8 x half> @stack_fold_reduceph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_reduceph_mask: + ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_reduceph_maskz(<8 x half> %a0, i8* %mask) { + ;CHECK-LABEL: stack_fold_reduceph_maskz: + ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2) + ret <8 x half> %3 +} + +define <16 x half> @stack_fold_reduceph_ymm(<16 x half> %a0) { + ;CHECK-LABEL: stack_fold_reduceph_ymm: + ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half>, i32, <16 x half>, i16) + +define <16 x half> @stack_fold_reduceph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) { + ;CHECK-LABEL: stack_fold_reduceph_mask_ymm: + ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <16 x half>, <16 x half>* %passthru + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_reduceph_maskz_ymm(<16 x half> %a0, i16* %mask) { + ;CHECK-LABEL: stack_fold_reduceph_maskz_ymm: + ;CHECK: vreduceph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2) + ret <16 x half> %3 +} + +define <8 x half> @stack_fold_rndscaleph(<8 x half> %a0) { + ;CHECK-LABEL: stack_fold_rndscaleph: + ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half>, i32, <8 x half>, i8) + +define <8 x half> @stack_fold_rndscaleph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_rndscaleph_mask: + ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_rndscaleph_maskz(<8 x half> %a0, i8* %mask) { + ;CHECK-LABEL: stack_fold_rndscaleph_maskz: + ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2) + ret <8 x half> %3 +} + +define <16 x half> @stack_fold_rndscaleph_ymm(<16 x half> %a0) { + ;CHECK-LABEL: stack_fold_rndscaleph_ymm: + ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half>, i32, <16 x half>, i16) + +define <16 x half> 
@stack_fold_rndscaleph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) { + ;CHECK-LABEL: stack_fold_rndscaleph_mask_ymm: + ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <16 x half>, <16 x half>* %passthru + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_rndscaleph_maskz_ymm(<16 x half> %a0, i16* %mask) { + ;CHECK-LABEL: stack_fold_rndscaleph_maskz_ymm: + ;CHECK: vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2) + ret <16 x half> %3 +} + +define <8 x half> @stack_fold_rsqrtph(<8 x half> %a0) { + ;CHECK-LABEL: stack_fold_rsqrtph: + ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half>, <8 x half>, i8) + +define <8 x half> @stack_fold_rsqrtph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_rsqrtph_mask: + ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_rsqrtph_maskz(<8 x half> %a0, i8* %mask) { + ;CHECK-LABEL: stack_fold_rsqrtph_maskz: + ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2) + ret <8 x half> %3 +} + +define <16 x half> @stack_fold_rsqrtph_ymm(<16 x half> %a0) { + ;CHECK-LABEL: stack_fold_rsqrtph_ymm: + ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half>, <16 x half>, i16) + +define <16 x half> @stack_fold_rsqrtph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) { + ;CHECK-LABEL: stack_fold_rsqrtph_mask_ymm: + ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <16 x half>, <16 x half>* %passthru + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask) + ret <16 x half> %3 +} + +define <16 x half> @stack_fold_rsqrtph_maskz_ymm(<16 x half> %a0, i16* %mask) { + ;CHECK-LABEL: stack_fold_rsqrtph_maskz_ymm: + ;CHECK: vrsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2) + ret <16 x half> %3 +} + +define <8 x half> @stack_fold_sqrtph(<8 x half> %a0) { + ;CHECK-LABEL: stack_fold_sqrtph: + ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0) + ret <8 x half> %2 +} +declare <8 x half> @llvm.sqrt.v8f16(<8 x half>) + +define <8 x half> @stack_fold_sqrtph_mask(<8 x half> %a0, <8 x half>* %passthru, i8 %mask) { + ;CHECK-LABEL: stack_fold_sqrtph_mask: + ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} 
{{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0) + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> %2 + ret <8 x half> %5 +} + +define <8 x half> @stack_fold_sqrtph_maskz(<8 x half> %a0, i8* %mask) { + ;CHECK-LABEL: stack_fold_sqrtph_maskz: + ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i8, i8* %mask + %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0) + %4 = bitcast i8 %2 to <8 x i1> + %5 = select <8 x i1> %4, <8 x half> %3, <8 x half> zeroinitializer + ret <8 x half> %5 +} + +define <16 x half> @stack_fold_sqrtph_ymm(<16 x half> %a0) { + ;CHECK-LABEL: stack_fold_sqrtph_ymm: + ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0) + ret <16 x half> %2 +} +declare <16 x half> @llvm.sqrt.v16f16(<16 x half>) + +define <16 x half> @stack_fold_sqrtph_mask_ymm(<16 x half> %a0, <16 x half>* %passthru, i16 %mask) { + ;CHECK-LABEL: stack_fold_sqrtph_mask_ymm: + ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <16 x half>, <16 x half>* %passthru + %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0) + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x half> %3, <16 x half> %2 + ret <16 x half> %5 +} + +define <16 x half> @stack_fold_sqrtph_maskz_ymm(<16 x half> %a0, i16* %mask) { + ;CHECK-LABEL: stack_fold_sqrtph_maskz_ymm: + ;CHECK: vsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load i16, i16* %mask + %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0) + %4 = bitcast i16 %2 to <16 x i1> + %5 = select <16 x i1> 
%4, <16 x half> %3, <16 x half> zeroinitializer + ret <16 x half> %5 +} + define <8 x half> @stack_fold_subph(<8 x half> %a0, <8 x half> %a1) { ;CHECK-LABEL: stack_fold_subph ;CHECK: vsubph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll index 7171ac32336d36..f73742947b5ccb 100644 --- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll @@ -6,6 +6,7 @@ declare <8 x half> @llvm.experimental.constrained.fadd.v8f16(<8 x half>, <8 x ha declare <8 x half> @llvm.experimental.constrained.fsub.v8f16(<8 x half>, <8 x half>, metadata, metadata) declare <8 x half> @llvm.experimental.constrained.fmul.v8f16(<8 x half>, <8 x half>, metadata, metadata) declare <8 x half> @llvm.experimental.constrained.fdiv.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half>, metadata, metadata) declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, metadata) declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) @@ -61,6 +62,18 @@ define <8 x half> @f8(<8 x half> %a, <8 x half> %b) #0 { ret <8 x half> %ret } +define <8 x half> @f10(<8 x half> %a) #0 { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtph %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %sqrt = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16( + <8 x half> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half > %sqrt +} + define <8 x half> @f11(<2 x double> %a0, <8 x half> %a1) #0 { ; CHECK-LABEL: f11: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll index 8b78a5b5c492c2..d5868287823fb3 100644 --- a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll @@ -6,10 +6,16 @@ declare <16 x half> @llvm.experimental.constrained.fadd.v16f16(<16 x half>, <16 declare <16 x half> @llvm.experimental.constrained.fsub.v16f16(<16 x half>, <16 x half>, metadata, metadata) declare <16 x half> @llvm.experimental.constrained.fmul.v16f16(<16 x half>, <16 x half>, metadata, metadata) declare <16 x half> @llvm.experimental.constrained.fdiv.v16f16(<16 x half>, <16 x half>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.sqrt.v16f16(<16 x half>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata) declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f64(<4 x double>, metadata, metadata) declare <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f32(<8 x float>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.ceil.v16f16(<16 x half>, metadata) +declare <16 x half> @llvm.experimental.constrained.floor.v16f16(<16 x half>, metadata) +declare <16 x half> @llvm.experimental.constrained.trunc.v16f16(<16 x half>, metadata) +declare <16 x half> @llvm.experimental.constrained.rint.v16f16(<16 x half>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(<16 x half>, metadata, metadata) define <16 x half> @f2(<16 x half> %a, <16 x half> %b) #0 { ; CHECK-LABEL: f2: @@ -55,6 
+61,19 @@ define <16 x half> @f8(<16 x half> %a, <16 x half> %b) #0 { ret <16 x half> %ret } + +define <16 x half> @f10(<16 x half> %a) #0 { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtph %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.sqrt.v16f16( + <16 x half> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half > %ret +} + define <4 x double> @f11(<4 x half> %a) #0 { ; CHECK-LABEL: f11: ; CHECK: # %bb.0: @@ -103,4 +122,57 @@ define <8 x half> @f15(<8 x float> %a) #0 { ret <8 x half> %ret } +define <16 x half> @fceilv16f16(<16 x half> %f) #0 { +; CHECK-LABEL: fceilv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $10, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <16 x half> @llvm.experimental.constrained.ceil.v16f16( + <16 x half> %f, metadata !"fpexcept.strict") #0 + ret <16 x half> %res +} + +define <16 x half> @ffloorv16f16(<16 x half> %f) #0 { +; CHECK-LABEL: ffloorv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $9, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <16 x half> @llvm.experimental.constrained.floor.v16f16( + <16 x half> %f, metadata !"fpexcept.strict") #0 + ret <16 x half> %res +} + + +define <16 x half> @ftruncv16f16(<16 x half> %f) #0 { +; CHECK-LABEL: ftruncv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $11, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <16 x half> @llvm.experimental.constrained.trunc.v16f16( + <16 x half> %f, metadata !"fpexcept.strict") #0 + ret <16 x half> %res +} + +define <16 x half> @frintv16f16(<16 x half> %f) #0 { +; CHECK-LABEL: frintv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $4, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <16 x half> @llvm.experimental.constrained.rint.v16f16( + <16 x half> %f, + metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret <16 x half> %res +} + +define <16 x half> @fnearbyintv16f16(<16 x half> %f) #0 { +; CHECK-LABEL: fnearbyintv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $12, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <16 x half> @llvm.experimental.constrained.nearbyint.v16f16( + <16 x half> %f, + metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret <16 x half> %res +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll index 0a25d1c9d3d01b..6273a525b15d66 100644 --- a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll @@ -6,10 +6,16 @@ declare <32 x half> @llvm.experimental.constrained.fadd.v32f16(<32 x half>, <32 declare <32 x half> @llvm.experimental.constrained.fsub.v32f16(<32 x half>, <32 x half>, metadata, metadata) declare <32 x half> @llvm.experimental.constrained.fmul.v32f16(<32 x half>, <32 x half>, metadata, metadata) declare <32 x half> @llvm.experimental.constrained.fdiv.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.sqrt.v32f16(<32 x half>, metadata, metadata) declare <8 x double> @llvm.experimental.constrained.fpext.v8f64.v8f16(<8 x half>, metadata) declare <16 x float> @llvm.experimental.constrained.fpext.v16f32.v16f16(<16 x half>, metadata) declare <8 x half> @llvm.experimental.constrained.fptrunc.v8f16.v8f64(<8 x double>, metadata, metadata) declare <16 x half> @llvm.experimental.constrained.fptrunc.v16f16.v16f32(<16 x float>, metadata, metadata) +declare <32 x half> 
@llvm.experimental.constrained.ceil.v32f16(<32 x half>, metadata) +declare <32 x half> @llvm.experimental.constrained.floor.v32f16(<32 x half>, metadata) +declare <32 x half> @llvm.experimental.constrained.trunc.v32f16(<32 x half>, metadata) +declare <32 x half> @llvm.experimental.constrained.rint.v32f16(<32 x half>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.nearbyint.v32f16(<32 x half>, metadata, metadata) define <32 x half> @f2(<32 x half> %a, <32 x half> %b) #0 { ; CHECK-LABEL: f2: @@ -55,6 +61,18 @@ define <32 x half> @f8(<32 x half> %a, <32 x half> %b) #0 { ret <32 x half> %ret } +define <32 x half> @f10(<32 x half> %a) #0 { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: vsqrtph %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x half> @llvm.experimental.constrained.sqrt.v32f16( + <32 x half> %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half > %ret +} + define <8 x double> @f11(<8 x half> %a) #0 { ; CHECK-LABEL: f11: ; CHECK: # %bb.0: @@ -102,4 +120,51 @@ define <16 x half> @f15(<16 x float> %a) #0 { ret <16 x half> %ret } +define <32 x half> @strict_vector_fceil_v32f16(<32 x half> %f) #0 { +; CHECK-LABEL: strict_vector_fceil_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $10, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <32 x half> @llvm.experimental.constrained.ceil.v32f16(<32 x half> %f, metadata !"fpexcept.strict") #0 + ret <32 x half> %res +} + +define <32 x half> @strict_vector_ffloor_v32f16(<32 x half> %f) #0 { +; CHECK-LABEL: strict_vector_ffloor_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $9, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <32 x half> @llvm.experimental.constrained.floor.v32f16(<32 x half> %f, metadata !"fpexcept.strict") #0 + ret <32 x half> %res +} + +define <32 x half> @strict_vector_ftrunc_v32f16(<32 x half> %f) #0 { +; CHECK-LABEL: strict_vector_ftrunc_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $11, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <32 x half> @llvm.experimental.constrained.trunc.v32f16(<32 x half> %f, metadata !"fpexcept.strict") #0 + ret <32 x half> %res +} + +define <32 x half> @strict_vector_frint_v32f16(<32 x half> %f) #0 { +; CHECK-LABEL: strict_vector_frint_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $4, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <32 x half> @llvm.experimental.constrained.rint.v32f16(<32 x half> %f, + metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret <32 x half> %res +} + +define <32 x half> @strict_vector_fnearbyint_v32f16(<32 x half> %f) #0 { +; CHECK-LABEL: strict_vector_fnearbyint_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vrndscaleph $12, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <32 x half> @llvm.experimental.constrained.nearbyint.v32f16(<32 x half> %f, + metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret <32 x half> %res +} + attributes #0 = { strictfp } diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt index 81154318083352..67514e50b1e12d 100644 --- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt @@ -1356,3 +1356,411 @@ # ATT: vcvtw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z} # INTEL: vcvtw2ph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} 0x62,0x65,0x7e,0xdf,0x7d,0x72,0x80 + +# ATT: vfpclassph $123, %zmm30, %k5 +# INTEL: vfpclassph k5, zmm30, 123 +0x62,0x93,0x7c,0x48,0x66,0xee,0x7b + +# ATT: vfpclassphz $123, 
268435456(%rbp,%r14,8), %k5 {%k7} +# INTEL: vfpclassph k5 {k7}, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x7c,0x4f,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vfpclassph $123, (%r9){1to32}, %k5 +# INTEL: vfpclassph k5, word ptr [r9]{1to32}, 123 +0x62,0xd3,0x7c,0x58,0x66,0x29,0x7b + +# ATT: vfpclassphz $123, 8128(%rcx), %k5 +# INTEL: vfpclassph k5, zmmword ptr [rcx + 8128], 123 +0x62,0xf3,0x7c,0x48,0x66,0x69,0x7f,0x7b + +# ATT: vfpclassph $123, -256(%rdx){1to32}, %k5 {%k7} +# INTEL: vfpclassph k5 {k7}, word ptr [rdx - 256]{1to32}, 123 +0x62,0xf3,0x7c,0x5f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclasssh $123, %xmm30, %k5 +# INTEL: vfpclasssh k5, xmm30, 123 +0x62,0x93,0x7c,0x08,0x67,0xee,0x7b + +# ATT: vfpclasssh $123, 268435456(%rbp,%r14,8), %k5 {%k7} +# INTEL: vfpclasssh k5 {k7}, word ptr [rbp + 8*r14 + 268435456], 123 +0x62,0xb3,0x7c,0x0f,0x67,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vfpclasssh $123, (%r9), %k5 +# INTEL: vfpclasssh k5, word ptr [r9], 123 +0x62,0xd3,0x7c,0x08,0x67,0x29,0x7b + +# ATT: vfpclasssh $123, 254(%rcx), %k5 +# INTEL: vfpclasssh k5, word ptr [rcx + 254], 123 +0x62,0xf3,0x7c,0x08,0x67,0x69,0x7f,0x7b + +# ATT: vfpclasssh $123, -256(%rdx), %k5 {%k7} +# INTEL: vfpclasssh k5 {k7}, word ptr [rdx - 256], 123 +0x62,0xf3,0x7c,0x0f,0x67,0x6a,0x80,0x7b + +# ATT: vgetexpph %zmm29, %zmm30 +# INTEL: vgetexpph zmm30, zmm29 +0x62,0x06,0x7d,0x48,0x42,0xf5 + +# ATT: vgetexpph {sae}, %zmm29, %zmm30 +# INTEL: vgetexpph zmm30, zmm29, {sae} +0x62,0x06,0x7d,0x18,0x42,0xf5 + +# ATT: vgetexpph 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vgetexpph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x7d,0x4f,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vgetexpph (%r9){1to32}, %zmm30 +# INTEL: vgetexpph zmm30, word ptr [r9]{1to32} +0x62,0x46,0x7d,0x58,0x42,0x31 + +# ATT: vgetexpph 8128(%rcx), %zmm30 +# INTEL: vgetexpph zmm30, zmmword ptr [rcx + 8128] +0x62,0x66,0x7d,0x48,0x42,0x71,0x7f + +# ATT: vgetexpph -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vgetexpph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x66,0x7d,0xdf,0x42,0x72,0x80 + +# ATT: vgetexpsh %xmm28, %xmm29, %xmm30 +# INTEL: vgetexpsh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0x43,0xf4 + +# ATT: vgetexpsh {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vgetexpsh xmm30, xmm29, xmm28, {sae} +0x62,0x06,0x15,0x10,0x43,0xf4 + +# ATT: vgetexpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vgetexpsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0x43,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vgetexpsh (%r9), %xmm29, %xmm30 +# INTEL: vgetexpsh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0x43,0x31 + +# ATT: vgetexpsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vgetexpsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0x43,0x71,0x7f + +# ATT: vgetexpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vgetexpsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0x43,0x72,0x80 + +# ATT: vgetmantph $123, %zmm29, %zmm30 +# INTEL: vgetmantph zmm30, zmm29, 123 +0x62,0x03,0x7c,0x48,0x26,0xf5,0x7b + +# ATT: vgetmantph $123, {sae}, %zmm29, %zmm30 +# INTEL: vgetmantph zmm30, zmm29, {sae}, 123 +0x62,0x03,0x7c,0x18,0x26,0xf5,0x7b + +# ATT: vgetmantph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vgetmantph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0x23,0x7c,0x4f,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantph $123, (%r9){1to32}, %zmm30 +# INTEL: vgetmantph zmm30, word ptr [r9]{1to32}, 123 +0x62,0x43,0x7c,0x58,0x26,0x31,0x7b + +# ATT: 
vgetmantph $123, 8128(%rcx), %zmm30 +# INTEL: vgetmantph zmm30, zmmword ptr [rcx + 8128], 123 +0x62,0x63,0x7c,0x48,0x26,0x71,0x7f,0x7b + +# ATT: vgetmantph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vgetmantph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +0x62,0x63,0x7c,0xdf,0x26,0x72,0x80,0x7b + +# ATT: vgetmantsh $123, %xmm28, %xmm29, %xmm30 +# INTEL: vgetmantsh xmm30, xmm29, xmm28, 123 +0x62,0x03,0x14,0x00,0x27,0xf4,0x7b + +# ATT: vgetmantsh $123, {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vgetmantsh xmm30, xmm29, xmm28, {sae}, 123 +0x62,0x03,0x14,0x10,0x27,0xf4,0x7b + +# ATT: vgetmantsh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vgetmantsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456], 123 +0x62,0x23,0x14,0x07,0x27,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantsh $123, (%r9), %xmm29, %xmm30 +# INTEL: vgetmantsh xmm30, xmm29, word ptr [r9], 123 +0x62,0x43,0x14,0x00,0x27,0x31,0x7b + +# ATT: vgetmantsh $123, 254(%rcx), %xmm29, %xmm30 +# INTEL: vgetmantsh xmm30, xmm29, word ptr [rcx + 254], 123 +0x62,0x63,0x14,0x00,0x27,0x71,0x7f,0x7b + +# ATT: vgetmantsh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vgetmantsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256], 123 +0x62,0x63,0x14,0x87,0x27,0x72,0x80,0x7b + +# ATT: vrcpph %zmm29, %zmm30 +# INTEL: vrcpph zmm30, zmm29 +0x62,0x06,0x7d,0x48,0x4c,0xf5 + +# ATT: vrcpph 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vrcpph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x7d,0x4f,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrcpph (%r9){1to32}, %zmm30 +# INTEL: vrcpph zmm30, word ptr [r9]{1to32} +0x62,0x46,0x7d,0x58,0x4c,0x31 + +# ATT: vrcpph 8128(%rcx), %zmm30 +# INTEL: vrcpph zmm30, zmmword ptr [rcx + 8128] +0x62,0x66,0x7d,0x48,0x4c,0x71,0x7f + +# ATT: vrcpph -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vrcpph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x66,0x7d,0xdf,0x4c,0x72,0x80 + +# ATT: vrcpsh %xmm28, %xmm29, %xmm30 +# INTEL: vrcpsh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0x4d,0xf4 + +# ATT: vrcpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vrcpsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0x4d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrcpsh (%r9), %xmm29, %xmm30 +# INTEL: vrcpsh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0x4d,0x31 + +# ATT: vrcpsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vrcpsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0x4d,0x71,0x7f + +# ATT: vrcpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vrcpsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0x4d,0x72,0x80 + +# ATT: vreduceph $123, %zmm29, %zmm30 +# INTEL: vreduceph zmm30, zmm29, 123 +0x62,0x03,0x7c,0x48,0x56,0xf5,0x7b + +# ATT: vreduceph $123, {sae}, %zmm29, %zmm30 +# INTEL: vreduceph zmm30, zmm29, {sae}, 123 +0x62,0x03,0x7c,0x18,0x56,0xf5,0x7b + +# ATT: vreduceph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vreduceph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0x23,0x7c,0x4f,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreduceph $123, (%r9){1to32}, %zmm30 +# INTEL: vreduceph zmm30, word ptr [r9]{1to32}, 123 +0x62,0x43,0x7c,0x58,0x56,0x31,0x7b + +# ATT: vreduceph $123, 8128(%rcx), %zmm30 +# INTEL: vreduceph zmm30, zmmword ptr [rcx + 8128], 123 +0x62,0x63,0x7c,0x48,0x56,0x71,0x7f,0x7b + +# ATT: vreduceph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vreduceph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +0x62,0x63,0x7c,0xdf,0x56,0x72,0x80,0x7b + +# ATT: vreducesh $123, %xmm28, %xmm29, 
%xmm30 +# INTEL: vreducesh xmm30, xmm29, xmm28, 123 +0x62,0x03,0x14,0x00,0x57,0xf4,0x7b + +# ATT: vreducesh $123, {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vreducesh xmm30, xmm29, xmm28, {sae}, 123 +0x62,0x03,0x14,0x10,0x57,0xf4,0x7b + +# ATT: vreducesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vreducesh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456], 123 +0x62,0x23,0x14,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreducesh $123, (%r9), %xmm29, %xmm30 +# INTEL: vreducesh xmm30, xmm29, word ptr [r9], 123 +0x62,0x43,0x14,0x00,0x57,0x31,0x7b + +# ATT: vreducesh $123, 254(%rcx), %xmm29, %xmm30 +# INTEL: vreducesh xmm30, xmm29, word ptr [rcx + 254], 123 +0x62,0x63,0x14,0x00,0x57,0x71,0x7f,0x7b + +# ATT: vreducesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vreducesh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256], 123 +0x62,0x63,0x14,0x87,0x57,0x72,0x80,0x7b + +# ATT: vrndscaleph $123, %zmm29, %zmm30 +# INTEL: vrndscaleph zmm30, zmm29, 123 +0x62,0x03,0x7c,0x48,0x08,0xf5,0x7b + +# ATT: vrndscaleph $123, {sae}, %zmm29, %zmm30 +# INTEL: vrndscaleph zmm30, zmm29, {sae}, 123 +0x62,0x03,0x7c,0x18,0x08,0xf5,0x7b + +# ATT: vrndscaleph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vrndscaleph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456], 123 +0x62,0x23,0x7c,0x4f,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscaleph $123, (%r9){1to32}, %zmm30 +# INTEL: vrndscaleph zmm30, word ptr [r9]{1to32}, 123 +0x62,0x43,0x7c,0x58,0x08,0x31,0x7b + +# ATT: vrndscaleph $123, 8128(%rcx), %zmm30 +# INTEL: vrndscaleph zmm30, zmmword ptr [rcx + 8128], 123 +0x62,0x63,0x7c,0x48,0x08,0x71,0x7f,0x7b + +# ATT: vrndscaleph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vrndscaleph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32}, 123 +0x62,0x63,0x7c,0xdf,0x08,0x72,0x80,0x7b + +# ATT: vrndscalesh $123, %xmm28, %xmm29, %xmm30 +# INTEL: vrndscalesh xmm30, xmm29, xmm28, 123 +0x62,0x03,0x14,0x00,0x0a,0xf4,0x7b + +# ATT: vrndscalesh $123, {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vrndscalesh xmm30, xmm29, xmm28, {sae}, 123 +0x62,0x03,0x14,0x10,0x0a,0xf4,0x7b + +# ATT: vrndscalesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vrndscalesh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456], 123 +0x62,0x23,0x14,0x07,0x0a,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscalesh $123, (%r9), %xmm29, %xmm30 +# INTEL: vrndscalesh xmm30, xmm29, word ptr [r9], 123 +0x62,0x43,0x14,0x00,0x0a,0x31,0x7b + +# ATT: vrndscalesh $123, 254(%rcx), %xmm29, %xmm30 +# INTEL: vrndscalesh xmm30, xmm29, word ptr [rcx + 254], 123 +0x62,0x63,0x14,0x00,0x0a,0x71,0x7f,0x7b + +# ATT: vrndscalesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vrndscalesh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256], 123 +0x62,0x63,0x14,0x87,0x0a,0x72,0x80,0x7b + +# ATT: vrsqrtph %zmm29, %zmm30 +# INTEL: vrsqrtph zmm30, zmm29 +0x62,0x06,0x7d,0x48,0x4e,0xf5 + +# ATT: vrsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vrsqrtph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x7d,0x4f,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtph (%r9){1to32}, %zmm30 +# INTEL: vrsqrtph zmm30, word ptr [r9]{1to32} +0x62,0x46,0x7d,0x58,0x4e,0x31 + +# ATT: vrsqrtph 8128(%rcx), %zmm30 +# INTEL: vrsqrtph zmm30, zmmword ptr [rcx + 8128] +0x62,0x66,0x7d,0x48,0x4e,0x71,0x7f + +# ATT: vrsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vrsqrtph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x66,0x7d,0xdf,0x4e,0x72,0x80 + +# ATT: vrsqrtsh %xmm28, %xmm29, %xmm30 +# INTEL: vrsqrtsh xmm30, xmm29, xmm28 
+0x62,0x06,0x15,0x00,0x4f,0xf4 + +# ATT: vrsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vrsqrtsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0x4f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtsh (%r9), %xmm29, %xmm30 +# INTEL: vrsqrtsh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0x4f,0x31 + +# ATT: vrsqrtsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vrsqrtsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0x4f,0x71,0x7f + +# ATT: vrsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vrsqrtsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0x4f,0x72,0x80 + +# ATT: vscalefph %zmm28, %zmm29, %zmm30 +# INTEL: vscalefph zmm30, zmm29, zmm28 +0x62,0x06,0x15,0x40,0x2c,0xf4 + +# ATT: vscalefph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vscalefph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x2c,0xf4 + +# ATT: vscalefph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vscalefph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x47,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vscalefph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vscalefph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x46,0x15,0x50,0x2c,0x31 + +# ATT: vscalefph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vscalefph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x66,0x15,0x40,0x2c,0x71,0x7f + +# ATT: vscalefph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vscalefph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x66,0x15,0xd7,0x2c,0x72,0x80 + +# ATT: vscalefsh %xmm28, %xmm29, %xmm30 +# INTEL: vscalefsh xmm30, xmm29, xmm28 +0x62,0x06,0x15,0x00,0x2d,0xf4 + +# ATT: vscalefsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vscalefsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x06,0x15,0x10,0x2d,0xf4 + +# ATT: vscalefsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vscalefsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x26,0x15,0x07,0x2d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vscalefsh (%r9), %xmm29, %xmm30 +# INTEL: vscalefsh xmm30, xmm29, word ptr [r9] +0x62,0x46,0x15,0x00,0x2d,0x31 + +# ATT: vscalefsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vscalefsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x66,0x15,0x00,0x2d,0x71,0x7f + +# ATT: vscalefsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vscalefsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x66,0x15,0x87,0x2d,0x72,0x80 + +# ATT: vsqrtph %zmm29, %zmm30 +# INTEL: vsqrtph zmm30, zmm29 +0x62,0x05,0x7c,0x48,0x51,0xf5 + +# ATT: vsqrtph {rn-sae}, %zmm29, %zmm30 +# INTEL: vsqrtph zmm30, zmm29, {rn-sae} +0x62,0x05,0x7c,0x18,0x51,0xf5 + +# ATT: vsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7} +# INTEL: vsqrtph zmm30 {k7}, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x4f,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsqrtph (%r9){1to32}, %zmm30 +# INTEL: vsqrtph zmm30, word ptr [r9]{1to32} +0x62,0x45,0x7c,0x58,0x51,0x31 + +# ATT: vsqrtph 8128(%rcx), %zmm30 +# INTEL: vsqrtph zmm30, zmmword ptr [rcx + 8128] +0x62,0x65,0x7c,0x48,0x51,0x71,0x7f + +# ATT: vsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z} +# INTEL: vsqrtph zmm30 {k7} {z}, word ptr [rdx - 256]{1to32} +0x62,0x65,0x7c,0xdf,0x51,0x72,0x80 + +# ATT: vsqrtsh %xmm28, %xmm29, %xmm30 +# INTEL: vsqrtsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x51,0xf4 + +# ATT: vsqrtsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vsqrtsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x16,0x10,0x51,0xf4 + +# ATT: vsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vsqrtsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] 
+0x62,0x25,0x16,0x07,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsqrtsh (%r9), %xmm29, %xmm30 +# INTEL: vsqrtsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x51,0x31 + +# ATT: vsqrtsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vsqrtsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x51,0x71,0x7f + +# ATT: vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vsqrtsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x51,0x72,0x80 diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt index 63acd5be1946fe..8f480fc13d82f4 100644 --- a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt @@ -1136,3 +1136,359 @@ # ATT: vcvtw2ph -256(%edx){1to16}, %ymm6 {%k7} {z} # INTEL: vcvtw2ph ymm6 {k7} {z}, word ptr [edx - 256]{1to16} 0x62,0xf5,0x7e,0xbf,0x7d,0x72,0x80 + +# ATT: vfpclassph $123, %xmm6, %k5 +# INTEL: vfpclassph k5, xmm6, 123 +0x62,0xf3,0x7c,0x08,0x66,0xee,0x7b + +# ATT: vfpclassph $123, %ymm6, %k5 +# INTEL: vfpclassph k5, ymm6, 123 +0x62,0xf3,0x7c,0x28,0x66,0xee,0x7b + +# ATT: vfpclassphx $123, 268435456(%esp,%esi,8), %k5 {%k7} +# INTEL: vfpclassph k5 {k7}, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7c,0x0f,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vfpclassph $123, (%ecx){1to8}, %k5 +# INTEL: vfpclassph k5, word ptr [ecx]{1to8}, 123 +0x62,0xf3,0x7c,0x18,0x66,0x29,0x7b + +# ATT: vfpclassphx $123, 2032(%ecx), %k5 +# INTEL: vfpclassph k5, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7c,0x08,0x66,0x69,0x7f,0x7b + +# ATT: vfpclassph $123, -256(%edx){1to8}, %k5 {%k7} +# INTEL: vfpclassph k5 {k7}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7c,0x1f,0x66,0x6a,0x80,0x7b + +# ATT: vfpclassph $123, (%ecx){1to16}, %k5 +# INTEL: vfpclassph k5, word ptr [ecx]{1to16}, 123 +0x62,0xf3,0x7c,0x38,0x66,0x29,0x7b + +# ATT: vfpclassphy $123, 4064(%ecx), %k5 +# INTEL: vfpclassph k5, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7c,0x28,0x66,0x69,0x7f,0x7b + +# ATT: vfpclassph $123, -256(%edx){1to16}, %k5 {%k7} +# INTEL: vfpclassph k5 {k7}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7c,0x3f,0x66,0x6a,0x80,0x7b + +# ATT: vgetexpph %xmm5, %xmm6 +# INTEL: vgetexpph xmm6, xmm5 +0x62,0xf6,0x7d,0x08,0x42,0xf5 + +# ATT: vgetexpph %ymm5, %ymm6 +# INTEL: vgetexpph ymm6, ymm5 +0x62,0xf6,0x7d,0x28,0x42,0xf5 + +# ATT: vgetexpph 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vgetexpph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7d,0x0f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vgetexpph (%ecx){1to8}, %xmm6 +# INTEL: vgetexpph xmm6, word ptr [ecx]{1to8} +0x62,0xf6,0x7d,0x18,0x42,0x31 + +# ATT: vgetexpph 2032(%ecx), %xmm6 +# INTEL: vgetexpph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf6,0x7d,0x08,0x42,0x71,0x7f + +# ATT: vgetexpph -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vgetexpph xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf6,0x7d,0x9f,0x42,0x72,0x80 + +# ATT: vgetexpph 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vgetexpph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7d,0x2f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vgetexpph (%ecx){1to16}, %ymm6 +# INTEL: vgetexpph ymm6, word ptr [ecx]{1to16} +0x62,0xf6,0x7d,0x38,0x42,0x31 + +# ATT: vgetexpph 4064(%ecx), %ymm6 +# INTEL: vgetexpph ymm6, ymmword ptr [ecx + 4064] +0x62,0xf6,0x7d,0x28,0x42,0x71,0x7f + +# ATT: vgetexpph -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vgetexpph ymm6 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf6,0x7d,0xbf,0x42,0x72,0x80 + +# ATT: vgetmantph $123, %ymm5, %ymm6 
+# INTEL: vgetmantph ymm6, ymm5, 123 +0x62,0xf3,0x7c,0x28,0x26,0xf5,0x7b + +# ATT: vgetmantph $123, %xmm5, %xmm6 +# INTEL: vgetmantph xmm6, xmm5, 123 +0x62,0xf3,0x7c,0x08,0x26,0xf5,0x7b + +# ATT: vgetmantph $123, 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vgetmantph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7c,0x0f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantph $123, (%ecx){1to8}, %xmm6 +# INTEL: vgetmantph xmm6, word ptr [ecx]{1to8}, 123 +0x62,0xf3,0x7c,0x18,0x26,0x31,0x7b + +# ATT: vgetmantph $123, 2032(%ecx), %xmm6 +# INTEL: vgetmantph xmm6, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7c,0x08,0x26,0x71,0x7f,0x7b + +# ATT: vgetmantph $123, -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vgetmantph xmm6 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7c,0x9f,0x26,0x72,0x80,0x7b + +# ATT: vgetmantph $123, 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vgetmantph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7c,0x2f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vgetmantph $123, (%ecx){1to16}, %ymm6 +# INTEL: vgetmantph ymm6, word ptr [ecx]{1to16}, 123 +0x62,0xf3,0x7c,0x38,0x26,0x31,0x7b + +# ATT: vgetmantph $123, 4064(%ecx), %ymm6 +# INTEL: vgetmantph ymm6, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7c,0x28,0x26,0x71,0x7f,0x7b + +# ATT: vgetmantph $123, -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vgetmantph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7c,0xbf,0x26,0x72,0x80,0x7b + +# ATT: vrcpph %xmm5, %xmm6 +# INTEL: vrcpph xmm6, xmm5 +0x62,0xf6,0x7d,0x08,0x4c,0xf5 + +# ATT: vrcpph %ymm5, %ymm6 +# INTEL: vrcpph ymm6, ymm5 +0x62,0xf6,0x7d,0x28,0x4c,0xf5 + +# ATT: vrcpph 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vrcpph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7d,0x0f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrcpph (%ecx){1to8}, %xmm6 +# INTEL: vrcpph xmm6, word ptr [ecx]{1to8} +0x62,0xf6,0x7d,0x18,0x4c,0x31 + +# ATT: vrcpph 2032(%ecx), %xmm6 +# INTEL: vrcpph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf6,0x7d,0x08,0x4c,0x71,0x7f + +# ATT: vrcpph -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vrcpph xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf6,0x7d,0x9f,0x4c,0x72,0x80 + +# ATT: vrcpph 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vrcpph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7d,0x2f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrcpph (%ecx){1to16}, %ymm6 +# INTEL: vrcpph ymm6, word ptr [ecx]{1to16} +0x62,0xf6,0x7d,0x38,0x4c,0x31 + +# ATT: vrcpph 4064(%ecx), %ymm6 +# INTEL: vrcpph ymm6, ymmword ptr [ecx + 4064] +0x62,0xf6,0x7d,0x28,0x4c,0x71,0x7f + +# ATT: vrcpph -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vrcpph ymm6 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf6,0x7d,0xbf,0x4c,0x72,0x80 + +# ATT: vreduceph $123, %ymm5, %ymm6 +# INTEL: vreduceph ymm6, ymm5, 123 +0x62,0xf3,0x7c,0x28,0x56,0xf5,0x7b + +# ATT: vreduceph $123, %xmm5, %xmm6 +# INTEL: vreduceph xmm6, xmm5, 123 +0x62,0xf3,0x7c,0x08,0x56,0xf5,0x7b + +# ATT: vreduceph $123, 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vreduceph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7c,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreduceph $123, (%ecx){1to8}, %xmm6 +# INTEL: vreduceph xmm6, word ptr [ecx]{1to8}, 123 +0x62,0xf3,0x7c,0x18,0x56,0x31,0x7b + +# ATT: vreduceph $123, 2032(%ecx), %xmm6 +# INTEL: vreduceph xmm6, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7c,0x08,0x56,0x71,0x7f,0x7b + +# ATT: vreduceph $123, -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vreduceph xmm6 
{k7} {z}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7c,0x9f,0x56,0x72,0x80,0x7b + +# ATT: vreduceph $123, 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vreduceph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7c,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vreduceph $123, (%ecx){1to16}, %ymm6 +# INTEL: vreduceph ymm6, word ptr [ecx]{1to16}, 123 +0x62,0xf3,0x7c,0x38,0x56,0x31,0x7b + +# ATT: vreduceph $123, 4064(%ecx), %ymm6 +# INTEL: vreduceph ymm6, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7c,0x28,0x56,0x71,0x7f,0x7b + +# ATT: vreduceph $123, -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vreduceph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7c,0xbf,0x56,0x72,0x80,0x7b + +# ATT: vrndscaleph $123, %ymm5, %ymm6 +# INTEL: vrndscaleph ymm6, ymm5, 123 +0x62,0xf3,0x7c,0x28,0x08,0xf5,0x7b + +# ATT: vrndscaleph $123, %xmm5, %xmm6 +# INTEL: vrndscaleph xmm6, xmm5, 123 +0x62,0xf3,0x7c,0x08,0x08,0xf5,0x7b + +# ATT: vrndscaleph $123, 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vrndscaleph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7c,0x0f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscaleph $123, (%ecx){1to8}, %xmm6 +# INTEL: vrndscaleph xmm6, word ptr [ecx]{1to8}, 123 +0x62,0xf3,0x7c,0x18,0x08,0x31,0x7b + +# ATT: vrndscaleph $123, 2032(%ecx), %xmm6 +# INTEL: vrndscaleph xmm6, xmmword ptr [ecx + 2032], 123 +0x62,0xf3,0x7c,0x08,0x08,0x71,0x7f,0x7b + +# ATT: vrndscaleph $123, -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vrndscaleph xmm6 {k7} {z}, word ptr [edx - 256]{1to8}, 123 +0x62,0xf3,0x7c,0x9f,0x08,0x72,0x80,0x7b + +# ATT: vrndscaleph $123, 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vrndscaleph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456], 123 +0x62,0xf3,0x7c,0x2f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b + +# ATT: vrndscaleph $123, (%ecx){1to16}, %ymm6 +# INTEL: vrndscaleph ymm6, word ptr [ecx]{1to16}, 123 +0x62,0xf3,0x7c,0x38,0x08,0x31,0x7b + +# ATT: vrndscaleph $123, 4064(%ecx), %ymm6 +# INTEL: vrndscaleph ymm6, ymmword ptr [ecx + 4064], 123 +0x62,0xf3,0x7c,0x28,0x08,0x71,0x7f,0x7b + +# ATT: vrndscaleph $123, -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vrndscaleph ymm6 {k7} {z}, word ptr [edx - 256]{1to16}, 123 +0x62,0xf3,0x7c,0xbf,0x08,0x72,0x80,0x7b + +# ATT: vrsqrtph %xmm5, %xmm6 +# INTEL: vrsqrtph xmm6, xmm5 +0x62,0xf6,0x7d,0x08,0x4e,0xf5 + +# ATT: vrsqrtph %ymm5, %ymm6 +# INTEL: vrsqrtph ymm6, ymm5 +0x62,0xf6,0x7d,0x28,0x4e,0xf5 + +# ATT: vrsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vrsqrtph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7d,0x0f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtph (%ecx){1to8}, %xmm6 +# INTEL: vrsqrtph xmm6, word ptr [ecx]{1to8} +0x62,0xf6,0x7d,0x18,0x4e,0x31 + +# ATT: vrsqrtph 2032(%ecx), %xmm6 +# INTEL: vrsqrtph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf6,0x7d,0x08,0x4e,0x71,0x7f + +# ATT: vrsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vrsqrtph xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf6,0x7d,0x9f,0x4e,0x72,0x80 + +# ATT: vrsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vrsqrtph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x7d,0x2f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vrsqrtph (%ecx){1to16}, %ymm6 +# INTEL: vrsqrtph ymm6, word ptr [ecx]{1to16} +0x62,0xf6,0x7d,0x38,0x4e,0x31 + +# ATT: vrsqrtph 4064(%ecx), %ymm6 +# INTEL: vrsqrtph ymm6, ymmword ptr [ecx + 4064] +0x62,0xf6,0x7d,0x28,0x4e,0x71,0x7f + +# ATT: vrsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vrsqrtph ymm6 {k7} {z}, word ptr [edx - 
256]{1to16} +0x62,0xf6,0x7d,0xbf,0x4e,0x72,0x80 + +# ATT: vscalefph %ymm4, %ymm5, %ymm6 +# INTEL: vscalefph ymm6, ymm5, ymm4 +0x62,0xf6,0x55,0x28,0x2c,0xf4 + +# ATT: vscalefph %xmm4, %xmm5, %xmm6 +# INTEL: vscalefph xmm6, xmm5, xmm4 +0x62,0xf6,0x55,0x08,0x2c,0xf4 + +# ATT: vscalefph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vscalefph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x2f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vscalefph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vscalefph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf6,0x55,0x38,0x2c,0x31 + +# ATT: vscalefph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vscalefph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf6,0x55,0x28,0x2c,0x71,0x7f + +# ATT: vscalefph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vscalefph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf6,0x55,0xbf,0x2c,0x72,0x80 + +# ATT: vscalefph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vscalefph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf6,0x55,0x0f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vscalefph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vscalefph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf6,0x55,0x18,0x2c,0x31 + +# ATT: vscalefph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vscalefph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf6,0x55,0x08,0x2c,0x71,0x7f + +# ATT: vscalefph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vscalefph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf6,0x55,0x9f,0x2c,0x72,0x80 + +# ATT: vsqrtph %xmm5, %xmm6 +# INTEL: vsqrtph xmm6, xmm5 +0x62,0xf5,0x7c,0x08,0x51,0xf5 + +# ATT: vsqrtph %ymm5, %ymm6 +# INTEL: vsqrtph ymm6, ymm5 +0x62,0xf5,0x7c,0x28,0x51,0xf5 + +# ATT: vsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7} +# INTEL: vsqrtph xmm6 {k7}, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x0f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsqrtph (%ecx){1to8}, %xmm6 +# INTEL: vsqrtph xmm6, word ptr [ecx]{1to8} +0x62,0xf5,0x7c,0x18,0x51,0x31 + +# ATT: vsqrtph 2032(%ecx), %xmm6 +# INTEL: vsqrtph xmm6, xmmword ptr [ecx + 2032] +0x62,0xf5,0x7c,0x08,0x51,0x71,0x7f + +# ATT: vsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z} +# INTEL: vsqrtph xmm6 {k7} {z}, word ptr [edx - 256]{1to8} +0x62,0xf5,0x7c,0x9f,0x51,0x72,0x80 + +# ATT: vsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7} +# INTEL: vsqrtph ymm6 {k7}, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x7c,0x2f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsqrtph (%ecx){1to16}, %ymm6 +# INTEL: vsqrtph ymm6, word ptr [ecx]{1to16} +0x62,0xf5,0x7c,0x38,0x51,0x31 + +# ATT: vsqrtph 4064(%ecx), %ymm6 +# INTEL: vsqrtph ymm6, ymmword ptr [ecx + 4064] +0x62,0xf5,0x7c,0x28,0x51,0x71,0x7f + +# ATT: vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z} +# INTEL: vsqrtph ymm6 {k7} {z}, word ptr [edx - 256]{1to16} +0x62,0xf5,0x7c,0xbf,0x51,0x72,0x80 diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s index 1ca659f29aceae..b358705fbedc8e 100644 --- a/llvm/test/MC/X86/avx512fp16.s +++ b/llvm/test/MC/X86/avx512fp16.s @@ -1355,3 +1355,411 @@ // CHECK: vcvtw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z} // CHECK: encoding: [0x62,0x65,0x7e,0xdf,0x7d,0x72,0x80] vcvtw2ph -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vfpclassph $123, %zmm30, %k5 +// CHECK: encoding: [0x62,0x93,0x7c,0x48,0x66,0xee,0x7b] + vfpclassph $123, %zmm30, %k5 + +// CHECK: vfpclassphz $123, 268435456(%rbp,%r14,8), %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x7c,0x4f,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vfpclassphz $123, 268435456(%rbp,%r14,8), %k5 {%k7} + +// CHECK: 
vfpclassph $123, (%r9){1to32}, %k5 +// CHECK: encoding: [0x62,0xd3,0x7c,0x58,0x66,0x29,0x7b] + vfpclassph $123, (%r9){1to32}, %k5 + +// CHECK: vfpclassphz $123, 8128(%rcx), %k5 +// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x66,0x69,0x7f,0x7b] + vfpclassphz $123, 8128(%rcx), %k5 + +// CHECK: vfpclassph $123, -256(%rdx){1to32}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x5f,0x66,0x6a,0x80,0x7b] + vfpclassph $123, -256(%rdx){1to32}, %k5 {%k7} + +// CHECK: vfpclasssh $123, %xmm30, %k5 +// CHECK: encoding: [0x62,0x93,0x7c,0x08,0x67,0xee,0x7b] + vfpclasssh $123, %xmm30, %k5 + +// CHECK: vfpclasssh $123, 268435456(%rbp,%r14,8), %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x7c,0x0f,0x67,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vfpclasssh $123, 268435456(%rbp,%r14,8), %k5 {%k7} + +// CHECK: vfpclasssh $123, (%r9), %k5 +// CHECK: encoding: [0x62,0xd3,0x7c,0x08,0x67,0x29,0x7b] + vfpclasssh $123, (%r9), %k5 + +// CHECK: vfpclasssh $123, 254(%rcx), %k5 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x67,0x69,0x7f,0x7b] + vfpclasssh $123, 254(%rcx), %k5 + +// CHECK: vfpclasssh $123, -256(%rdx), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x67,0x6a,0x80,0x7b] + vfpclasssh $123, -256(%rdx), %k5 {%k7} + +// CHECK: vgetexpph %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x7d,0x48,0x42,0xf5] + vgetexpph %zmm29, %zmm30 + +// CHECK: vgetexpph {sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x7d,0x18,0x42,0xf5] + vgetexpph {sae}, %zmm29, %zmm30 + +// CHECK: vgetexpph 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x7d,0x4f,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexpph 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vgetexpph (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x46,0x7d,0x58,0x42,0x31] + vgetexpph (%r9){1to32}, %zmm30 + +// CHECK: vgetexpph 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x66,0x7d,0x48,0x42,0x71,0x7f] + vgetexpph 8128(%rcx), %zmm30 + +// CHECK: vgetexpph -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x7d,0xdf,0x42,0x72,0x80] + vgetexpph -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vgetexpsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x43,0xf4] + vgetexpsh %xmm28, %xmm29, %xmm30 + +// CHECK: vgetexpsh {sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x43,0xf4] + vgetexpsh {sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vgetexpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x43,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vgetexpsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0x43,0x31] + vgetexpsh (%r9), %xmm29, %xmm30 + +// CHECK: vgetexpsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x43,0x71,0x7f] + vgetexpsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vgetexpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0x43,0x72,0x80] + vgetexpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vgetmantph $123, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x03,0x7c,0x48,0x26,0xf5,0x7b] + vgetmantph $123, %zmm29, %zmm30 + +// CHECK: vgetmantph $123, {sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x03,0x7c,0x18,0x26,0xf5,0x7b] + vgetmantph $123, {sae}, %zmm29, %zmm30 + +// CHECK: vgetmantph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x23,0x7c,0x4f,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vgetmantph $123, (%r9){1to32}, %zmm30 +// CHECK: 
encoding: [0x62,0x43,0x7c,0x58,0x26,0x31,0x7b] + vgetmantph $123, (%r9){1to32}, %zmm30 + +// CHECK: vgetmantph $123, 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x63,0x7c,0x48,0x26,0x71,0x7f,0x7b] + vgetmantph $123, 8128(%rcx), %zmm30 + +// CHECK: vgetmantph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x63,0x7c,0xdf,0x26,0x72,0x80,0x7b] + vgetmantph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vgetmantsh $123, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x03,0x14,0x00,0x27,0xf4,0x7b] + vgetmantsh $123, %xmm28, %xmm29, %xmm30 + +// CHECK: vgetmantsh $123, {sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x03,0x14,0x10,0x27,0xf4,0x7b] + vgetmantsh $123, {sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vgetmantsh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x23,0x14,0x07,0x27,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantsh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vgetmantsh $123, (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x43,0x14,0x00,0x27,0x31,0x7b] + vgetmantsh $123, (%r9), %xmm29, %xmm30 + +// CHECK: vgetmantsh $123, 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x63,0x14,0x00,0x27,0x71,0x7f,0x7b] + vgetmantsh $123, 254(%rcx), %xmm29, %xmm30 + +// CHECK: vgetmantsh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x63,0x14,0x87,0x27,0x72,0x80,0x7b] + vgetmantsh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vrcpph %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x7d,0x48,0x4c,0xf5] + vrcpph %zmm29, %zmm30 + +// CHECK: vrcpph 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x7d,0x4f,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcpph 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vrcpph (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x46,0x7d,0x58,0x4c,0x31] + vrcpph (%r9){1to32}, %zmm30 + +// CHECK: vrcpph 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x66,0x7d,0x48,0x4c,0x71,0x7f] + vrcpph 8128(%rcx), %zmm30 + +// CHECK: vrcpph -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x7d,0xdf,0x4c,0x72,0x80] + vrcpph -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vrcpsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x4d,0xf4] + vrcpsh %xmm28, %xmm29, %xmm30 + +// CHECK: vrcpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x4d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcpsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vrcpsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0x4d,0x31] + vrcpsh (%r9), %xmm29, %xmm30 + +// CHECK: vrcpsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x4d,0x71,0x7f] + vrcpsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vrcpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0x4d,0x72,0x80] + vrcpsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vreduceph $123, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x03,0x7c,0x48,0x56,0xf5,0x7b] + vreduceph $123, %zmm29, %zmm30 + +// CHECK: vreduceph $123, {sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x03,0x7c,0x18,0x56,0xf5,0x7b] + vreduceph $123, {sae}, %zmm29, %zmm30 + +// CHECK: vreduceph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x23,0x7c,0x4f,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreduceph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vreduceph $123, (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x43,0x7c,0x58,0x56,0x31,0x7b] + vreduceph $123, (%r9){1to32}, %zmm30 + 
+// CHECK: vreduceph $123, 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x63,0x7c,0x48,0x56,0x71,0x7f,0x7b] + vreduceph $123, 8128(%rcx), %zmm30 + +// CHECK: vreduceph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x63,0x7c,0xdf,0x56,0x72,0x80,0x7b] + vreduceph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vreducesh $123, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x03,0x14,0x00,0x57,0xf4,0x7b] + vreducesh $123, %xmm28, %xmm29, %xmm30 + +// CHECK: vreducesh $123, {sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x03,0x14,0x10,0x57,0xf4,0x7b] + vreducesh $123, {sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vreducesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x23,0x14,0x07,0x57,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreducesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vreducesh $123, (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x43,0x14,0x00,0x57,0x31,0x7b] + vreducesh $123, (%r9), %xmm29, %xmm30 + +// CHECK: vreducesh $123, 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x63,0x14,0x00,0x57,0x71,0x7f,0x7b] + vreducesh $123, 254(%rcx), %xmm29, %xmm30 + +// CHECK: vreducesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x63,0x14,0x87,0x57,0x72,0x80,0x7b] + vreducesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vrndscaleph $123, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x03,0x7c,0x48,0x08,0xf5,0x7b] + vrndscaleph $123, %zmm29, %zmm30 + +// CHECK: vrndscaleph $123, {sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x03,0x7c,0x18,0x08,0xf5,0x7b] + vrndscaleph $123, {sae}, %zmm29, %zmm30 + +// CHECK: vrndscaleph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x23,0x7c,0x4f,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscaleph $123, 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vrndscaleph $123, (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x43,0x7c,0x58,0x08,0x31,0x7b] + vrndscaleph $123, (%r9){1to32}, %zmm30 + +// CHECK: vrndscaleph $123, 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x63,0x7c,0x48,0x08,0x71,0x7f,0x7b] + vrndscaleph $123, 8128(%rcx), %zmm30 + +// CHECK: vrndscaleph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x63,0x7c,0xdf,0x08,0x72,0x80,0x7b] + vrndscaleph $123, -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vrndscalesh $123, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x03,0x14,0x00,0x0a,0xf4,0x7b] + vrndscalesh $123, %xmm28, %xmm29, %xmm30 + +// CHECK: vrndscalesh $123, {sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x03,0x14,0x10,0x0a,0xf4,0x7b] + vrndscalesh $123, {sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vrndscalesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x23,0x14,0x07,0x0a,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscalesh $123, 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vrndscalesh $123, (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x43,0x14,0x00,0x0a,0x31,0x7b] + vrndscalesh $123, (%r9), %xmm29, %xmm30 + +// CHECK: vrndscalesh $123, 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x63,0x14,0x00,0x0a,0x71,0x7f,0x7b] + vrndscalesh $123, 254(%rcx), %xmm29, %xmm30 + +// CHECK: vrndscalesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x63,0x14,0x87,0x0a,0x72,0x80,0x7b] + vrndscalesh $123, -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vrsqrtph %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x7d,0x48,0x4e,0xf5] + vrsqrtph %zmm29, %zmm30 + +// CHECK: vrsqrtph 
268435456(%rbp,%r14,8), %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x7d,0x4f,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vrsqrtph (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x46,0x7d,0x58,0x4e,0x31] + vrsqrtph (%r9){1to32}, %zmm30 + +// CHECK: vrsqrtph 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x66,0x7d,0x48,0x4e,0x71,0x7f] + vrsqrtph 8128(%rcx), %zmm30 + +// CHECK: vrsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x7d,0xdf,0x4e,0x72,0x80] + vrsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vrsqrtsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x4f,0xf4] + vrsqrtsh %xmm28, %xmm29, %xmm30 + +// CHECK: vrsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x4f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vrsqrtsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0x4f,0x31] + vrsqrtsh (%r9), %xmm29, %xmm30 + +// CHECK: vrsqrtsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x4f,0x71,0x7f] + vrsqrtsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vrsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0x4f,0x72,0x80] + vrsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vscalefph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x40,0x2c,0xf4] + vscalefph %zmm28, %zmm29, %zmm30 + +// CHECK: vscalefph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x2c,0xf4] + vscalefph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vscalefph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x47,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vscalefph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x50,0x2c,0x31] + vscalefph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vscalefph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x40,0x2c,0x71,0x7f] + vscalefph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vscalefph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0xd7,0x2c,0x72,0x80] + vscalefph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vscalefsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x2d,0xf4] + vscalefsh %xmm28, %xmm29, %xmm30 + +// CHECK: vscalefsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x06,0x15,0x10,0x2d,0xf4] + vscalefsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vscalefsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x2d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vscalefsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x46,0x15,0x00,0x2d,0x31] + vscalefsh (%r9), %xmm29, %xmm30 + +// CHECK: vscalefsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x2d,0x71,0x7f] + vscalefsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vscalefsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x66,0x15,0x87,0x2d,0x72,0x80] + vscalefsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vsqrtph %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x48,0x51,0xf5] + vsqrtph %zmm29, %zmm30 + +// CHECK: vsqrtph {rn-sae}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x51,0xf5] + vsqrtph {rn-sae}, %zmm29, %zmm30 + +// CHECK: vsqrtph 268435456(%rbp,%r14,8), %zmm30 
{%k7} +// CHECK: encoding: [0x62,0x25,0x7c,0x4f,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtph 268435456(%rbp,%r14,8), %zmm30 {%k7} + +// CHECK: vsqrtph (%r9){1to32}, %zmm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x58,0x51,0x31] + vsqrtph (%r9){1to32}, %zmm30 + +// CHECK: vsqrtph 8128(%rcx), %zmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x48,0x51,0x71,0x7f] + vsqrtph 8128(%rcx), %zmm30 + +// CHECK: vsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7c,0xdf,0x51,0x72,0x80] + vsqrtph -256(%rdx){1to32}, %zmm30 {%k7} {z} + +// CHECK: vsqrtsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x51,0xf4] + vsqrtsh %xmm28, %xmm29, %xmm30 + +// CHECK: vsqrtsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x51,0xf4] + vsqrtsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vsqrtsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x51,0x31] + vsqrtsh (%r9), %xmm29, %xmm30 + +// CHECK: vsqrtsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x51,0x71,0x7f] + vsqrtsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x51,0x72,0x80] + vsqrtsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} diff --git a/llvm/test/MC/X86/avx512fp16vl.s b/llvm/test/MC/X86/avx512fp16vl.s index 466af9663d21a9..91c45a56a2e8a5 100644 --- a/llvm/test/MC/X86/avx512fp16vl.s +++ b/llvm/test/MC/X86/avx512fp16vl.s @@ -1135,3 +1135,359 @@ // CHECK: vcvtw2ph -256(%edx){1to16}, %ymm6 {%k7} {z} // CHECK: encoding: [0x62,0xf5,0x7e,0xbf,0x7d,0x72,0x80] vcvtw2ph -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vfpclassph $123, %xmm6, %k5 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x66,0xee,0x7b] + vfpclassph $123, %xmm6, %k5 + +// CHECK: vfpclassph $123, %ymm6, %k5 +// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x66,0xee,0x7b] + vfpclassph $123, %ymm6, %k5 + +// CHECK: vfpclassphx $123, 268435456(%esp,%esi,8), %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vfpclassphx $123, 268435456(%esp,%esi,8), %k5 {%k7} + +// CHECK: vfpclassph $123, (%ecx){1to8}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x66,0x29,0x7b] + vfpclassph $123, (%ecx){1to8}, %k5 + +// CHECK: vfpclassphx $123, 2032(%ecx), %k5 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x66,0x69,0x7f,0x7b] + vfpclassphx $123, 2032(%ecx), %k5 + +// CHECK: vfpclassph $123, -256(%edx){1to8}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x1f,0x66,0x6a,0x80,0x7b] + vfpclassph $123, -256(%edx){1to8}, %k5 {%k7} + +// CHECK: vfpclassph $123, (%ecx){1to16}, %k5 +// CHECK: encoding: [0x62,0xf3,0x7c,0x38,0x66,0x29,0x7b] + vfpclassph $123, (%ecx){1to16}, %k5 + +// CHECK: vfpclassphy $123, 4064(%ecx), %k5 +// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x66,0x69,0x7f,0x7b] + vfpclassphy $123, 4064(%ecx), %k5 + +// CHECK: vfpclassph $123, -256(%edx){1to16}, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x3f,0x66,0x6a,0x80,0x7b] + vfpclassph $123, -256(%edx){1to16}, %k5 {%k7} + +// CHECK: vgetexpph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x42,0xf5] + vgetexpph %xmm5, %xmm6 + +// CHECK: vgetexpph %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x42,0xf5] + vgetexpph %ymm5, %ymm6 + +// CHECK: vgetexpph 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: 
[0x62,0xf6,0x7d,0x0f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10] + vgetexpph 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vgetexpph (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x18,0x42,0x31] + vgetexpph (%ecx){1to8}, %xmm6 + +// CHECK: vgetexpph 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x42,0x71,0x7f] + vgetexpph 2032(%ecx), %xmm6 + +// CHECK: vgetexpph -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7d,0x9f,0x42,0x72,0x80] + vgetexpph -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vgetexpph 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7d,0x2f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10] + vgetexpph 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vgetexpph (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x38,0x42,0x31] + vgetexpph (%ecx){1to16}, %ymm6 + +// CHECK: vgetexpph 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x42,0x71,0x7f] + vgetexpph 4064(%ecx), %ymm6 + +// CHECK: vgetexpph -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7d,0xbf,0x42,0x72,0x80] + vgetexpph -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vgetmantph $123, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x26,0xf5,0x7b] + vgetmantph $123, %ymm5, %ymm6 + +// CHECK: vgetmantph $123, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x26,0xf5,0x7b] + vgetmantph $123, %xmm5, %xmm6 + +// CHECK: vgetmantph $123, 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantph $123, 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vgetmantph $123, (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x26,0x31,0x7b] + vgetmantph $123, (%ecx){1to8}, %xmm6 + +// CHECK: vgetmantph $123, 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x26,0x71,0x7f,0x7b] + vgetmantph $123, 2032(%ecx), %xmm6 + +// CHECK: vgetmantph $123, -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7c,0x9f,0x26,0x72,0x80,0x7b] + vgetmantph $123, -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vgetmantph $123, 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x2f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantph $123, 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vgetmantph $123, (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x38,0x26,0x31,0x7b] + vgetmantph $123, (%ecx){1to16}, %ymm6 + +// CHECK: vgetmantph $123, 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x26,0x71,0x7f,0x7b] + vgetmantph $123, 4064(%ecx), %ymm6 + +// CHECK: vgetmantph $123, -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7c,0xbf,0x26,0x72,0x80,0x7b] + vgetmantph $123, -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vrcpph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x4c,0xf5] + vrcpph %xmm5, %xmm6 + +// CHECK: vrcpph %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x4c,0xf5] + vrcpph %ymm5, %ymm6 + +// CHECK: vrcpph 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7d,0x0f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vrcpph 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vrcpph (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x18,0x4c,0x31] + vrcpph (%ecx){1to8}, %xmm6 + +// CHECK: vrcpph 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x4c,0x71,0x7f] + vrcpph 2032(%ecx), %xmm6 + +// CHECK: vrcpph -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7d,0x9f,0x4c,0x72,0x80] + vrcpph -256(%edx){1to8}, %xmm6 {%k7} {z} + 
+// CHECK: vrcpph 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7d,0x2f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vrcpph 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vrcpph (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x38,0x4c,0x31] + vrcpph (%ecx){1to16}, %ymm6 + +// CHECK: vrcpph 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x4c,0x71,0x7f] + vrcpph 4064(%ecx), %ymm6 + +// CHECK: vrcpph -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7d,0xbf,0x4c,0x72,0x80] + vrcpph -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vreduceph $123, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x56,0xf5,0x7b] + vreduceph $123, %ymm5, %ymm6 + +// CHECK: vreduceph $123, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x56,0xf5,0x7b] + vreduceph $123, %xmm5, %xmm6 + +// CHECK: vreduceph $123, 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreduceph $123, 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vreduceph $123, (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x56,0x31,0x7b] + vreduceph $123, (%ecx){1to8}, %xmm6 + +// CHECK: vreduceph $123, 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x56,0x71,0x7f,0x7b] + vreduceph $123, 2032(%ecx), %xmm6 + +// CHECK: vreduceph $123, -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7c,0x9f,0x56,0x72,0x80,0x7b] + vreduceph $123, -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vreduceph $123, 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x2f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreduceph $123, 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vreduceph $123, (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x38,0x56,0x31,0x7b] + vreduceph $123, (%ecx){1to16}, %ymm6 + +// CHECK: vreduceph $123, 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x56,0x71,0x7f,0x7b] + vreduceph $123, 4064(%ecx), %ymm6 + +// CHECK: vreduceph $123, -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7c,0xbf,0x56,0x72,0x80,0x7b] + vreduceph $123, -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vrndscaleph $123, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x08,0xf5,0x7b] + vrndscaleph $123, %ymm5, %ymm6 + +// CHECK: vrndscaleph $123, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x08,0xf5,0x7b] + vrndscaleph $123, %xmm5, %xmm6 + +// CHECK: vrndscaleph $123, 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscaleph $123, 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vrndscaleph $123, (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x08,0x31,0x7b] + vrndscaleph $123, (%ecx){1to8}, %xmm6 + +// CHECK: vrndscaleph $123, 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x08,0x71,0x7f,0x7b] + vrndscaleph $123, 2032(%ecx), %xmm6 + +// CHECK: vrndscaleph $123, -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7c,0x9f,0x08,0x72,0x80,0x7b] + vrndscaleph $123, -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vrndscaleph $123, 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf3,0x7c,0x2f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscaleph $123, 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vrndscaleph $123, (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf3,0x7c,0x38,0x08,0x31,0x7b] + vrndscaleph $123, (%ecx){1to16}, %ymm6 + +// CHECK: vrndscaleph $123, 4064(%ecx), %ymm6 
+// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x08,0x71,0x7f,0x7b] + vrndscaleph $123, 4064(%ecx), %ymm6 + +// CHECK: vrndscaleph $123, -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf3,0x7c,0xbf,0x08,0x72,0x80,0x7b] + vrndscaleph $123, -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vrsqrtph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x4e,0xf5] + vrsqrtph %xmm5, %xmm6 + +// CHECK: vrsqrtph %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x4e,0xf5] + vrsqrtph %ymm5, %ymm6 + +// CHECK: vrsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7d,0x0f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vrsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vrsqrtph (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x18,0x4e,0x31] + vrsqrtph (%ecx){1to8}, %xmm6 + +// CHECK: vrsqrtph 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x08,0x4e,0x71,0x7f] + vrsqrtph 2032(%ecx), %xmm6 + +// CHECK: vrsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7d,0x9f,0x4e,0x72,0x80] + vrsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vrsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x7d,0x2f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vrsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vrsqrtph (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x38,0x4e,0x31] + vrsqrtph (%ecx){1to16}, %ymm6 + +// CHECK: vrsqrtph 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf6,0x7d,0x28,0x4e,0x71,0x7f] + vrsqrtph 4064(%ecx), %ymm6 + +// CHECK: vrsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x7d,0xbf,0x4e,0x72,0x80] + vrsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z} + +// CHECK: vscalefph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x2c,0xf4] + vscalefph %ymm4, %ymm5, %ymm6 + +// CHECK: vscalefph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2c,0xf4] + vscalefph %xmm4, %xmm5, %xmm6 + +// CHECK: vscalefph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x2f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vscalefph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vscalefph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x38,0x2c,0x31] + vscalefph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vscalefph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x28,0x2c,0x71,0x7f] + vscalefph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vscalefph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0xbf,0x2c,0x72,0x80] + vscalefph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vscalefph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vscalefph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vscalefph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x2c,0x31] + vscalefph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vscalefph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2c,0x71,0x7f] + vscalefph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vscalefph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf6,0x55,0x9f,0x2c,0x72,0x80] + vscalefph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vsqrtph %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x51,0xf5] + vsqrtph %xmm5, %xmm6 + +// CHECK: vsqrtph %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x51,0xf5] + vsqrtph %ymm5, %ymm6 + +// CHECK: vsqrtph 268435456(%esp,%esi,8), %xmm6 
{%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x0f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsqrtph 268435456(%esp,%esi,8), %xmm6 {%k7} + +// CHECK: vsqrtph (%ecx){1to8}, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x51,0x31] + vsqrtph (%ecx){1to8}, %xmm6 + +// CHECK: vsqrtph 2032(%ecx), %xmm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x51,0x71,0x7f] + vsqrtph 2032(%ecx), %xmm6 + +// CHECK: vsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0x9f,0x51,0x72,0x80] + vsqrtph -256(%edx){1to8}, %xmm6 {%k7} {z} + +// CHECK: vsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x7c,0x2f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsqrtph 268435456(%esp,%esi,8), %ymm6 {%k7} + +// CHECK: vsqrtph (%ecx){1to16}, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x38,0x51,0x31] + vsqrtph (%ecx){1to16}, %ymm6 + +// CHECK: vsqrtph 4064(%ecx), %ymm6 +// CHECK: encoding: [0x62,0xf5,0x7c,0x28,0x51,0x71,0x7f] + vsqrtph 4064(%ecx), %ymm6 + +// CHECK: vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x7c,0xbf,0x51,0x72,0x80] + vsqrtph -256(%edx){1to16}, %ymm6 {%k7} {z} diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s index 4b842f9bc622c9..36ca110e12e6e4 100644 --- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s @@ -1227,3 +1227,411 @@ // CHECK: vcvtw2ph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} // CHECK: encoding: [0x62,0xf5,0x7e,0xdf,0x7d,0x72,0x80] vcvtw2ph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vfpclassph k5, zmm6, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x66,0xee,0x7b] + vfpclassph k5, zmm6, 123 + +// CHECK: vfpclassph k5 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x4f,0x66,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vfpclassph k5 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vfpclassph k5, word ptr [ecx]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x58,0x66,0x29,0x7b] + vfpclassph k5, word ptr [ecx]{1to32}, 123 + +// CHECK: vfpclassph k5, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x66,0x69,0x7f,0x7b] + vfpclassph k5, zmmword ptr [ecx + 8128], 123 + +// CHECK: vfpclassph k5 {k7}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x5f,0x66,0x6a,0x80,0x7b] + vfpclassph k5 {k7}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vfpclasssh k5, xmm6, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x67,0xee,0x7b] + vfpclasssh k5, xmm6, 123 + +// CHECK: vfpclasssh k5 {k7}, word ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x67,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vfpclasssh k5 {k7}, word ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vfpclasssh k5, word ptr [ecx], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x67,0x29,0x7b] + vfpclasssh k5, word ptr [ecx], 123 + +// CHECK: vfpclasssh k5, word ptr [ecx + 254], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x67,0x69,0x7f,0x7b] + vfpclasssh k5, word ptr [ecx + 254], 123 + +// CHECK: vfpclasssh k5 {k7}, word ptr [edx - 256], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x0f,0x67,0x6a,0x80,0x7b] + vfpclasssh k5 {k7}, word ptr [edx - 256], 123 + +// CHECK: vgetexpph zmm6, zmm5 +// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x42,0xf5] + vgetexpph zmm6, zmm5 + +// CHECK: vgetexpph zmm6, zmm5, {sae} +// CHECK: encoding: [0x62,0xf6,0x7d,0x18,0x42,0xf5] + vgetexpph zmm6, zmm5, {sae} + +// CHECK: vgetexpph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: 
encoding: [0x62,0xf6,0x7d,0x4f,0x42,0xb4,0xf4,0x00,0x00,0x00,0x10] + vgetexpph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vgetexpph zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7d,0x58,0x42,0x31] + vgetexpph zmm6, word ptr [ecx]{1to32} + +// CHECK: vgetexpph zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x42,0x71,0x7f] + vgetexpph zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vgetexpph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7d,0xdf,0x42,0x72,0x80] + vgetexpph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vgetexpsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x43,0xf4] + vgetexpsh xmm6, xmm5, xmm4 + +// CHECK: vgetexpsh xmm6, xmm5, xmm4, {sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x43,0xf4] + vgetexpsh xmm6, xmm5, xmm4, {sae} + +// CHECK: vgetexpsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x43,0xb4,0xf4,0x00,0x00,0x00,0x10] + vgetexpsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vgetexpsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x43,0x31] + vgetexpsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vgetexpsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x43,0x71,0x7f] + vgetexpsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vgetexpsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x43,0x72,0x80] + vgetexpsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vgetmantph zmm6, zmm5, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x26,0xf5,0x7b] + vgetmantph zmm6, zmm5, 123 + +// CHECK: vgetmantph zmm6, zmm5, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x26,0xf5,0x7b] + vgetmantph zmm6, zmm5, {sae}, 123 + +// CHECK: vgetmantph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x4f,0x26,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vgetmantph zmm6, word ptr [ecx]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x58,0x26,0x31,0x7b] + vgetmantph zmm6, word ptr [ecx]{1to32}, 123 + +// CHECK: vgetmantph zmm6, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x26,0x71,0x7f,0x7b] + vgetmantph zmm6, zmmword ptr [ecx + 8128], 123 + +// CHECK: vgetmantph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0xdf,0x26,0x72,0x80,0x7b] + vgetmantph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vgetmantsh xmm6, xmm5, xmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x27,0xf4,0x7b] + vgetmantsh xmm6, xmm5, xmm4, 123 + +// CHECK: vgetmantsh xmm6, xmm5, xmm4, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x18,0x27,0xf4,0x7b] + vgetmantsh xmm6, xmm5, xmm4, {sae}, 123 + +// CHECK: vgetmantsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x0f,0x27,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vgetmantsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vgetmantsh xmm6, xmm5, word ptr [ecx], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x27,0x31,0x7b] + vgetmantsh xmm6, xmm5, word ptr [ecx], 123 + +// CHECK: vgetmantsh xmm6, xmm5, word ptr [ecx + 254], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x27,0x71,0x7f,0x7b] + vgetmantsh xmm6, xmm5, word ptr [ecx + 254], 123 + +// CHECK: vgetmantsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123 +// CHECK: encoding: 
[0x62,0xf3,0x54,0x8f,0x27,0x72,0x80,0x7b] + vgetmantsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123 + +// CHECK: vrcpph zmm6, zmm5 +// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x4c,0xf5] + vrcpph zmm6, zmm5 + +// CHECK: vrcpph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7d,0x4f,0x4c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vrcpph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrcpph zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7d,0x58,0x4c,0x31] + vrcpph zmm6, word ptr [ecx]{1to32} + +// CHECK: vrcpph zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x4c,0x71,0x7f] + vrcpph zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vrcpph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7d,0xdf,0x4c,0x72,0x80] + vrcpph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vrcpsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4d,0xf4] + vrcpsh xmm6, xmm5, xmm4 + +// CHECK: vrcpsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x4d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vrcpsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vrcpsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4d,0x31] + vrcpsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vrcpsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4d,0x71,0x7f] + vrcpsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vrcpsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x4d,0x72,0x80] + vrcpsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vreduceph zmm6, zmm5, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x56,0xf5,0x7b] + vreduceph zmm6, zmm5, 123 + +// CHECK: vreduceph zmm6, zmm5, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x56,0xf5,0x7b] + vreduceph zmm6, zmm5, {sae}, 123 + +// CHECK: vreduceph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x4f,0x56,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreduceph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vreduceph zmm6, word ptr [ecx]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x58,0x56,0x31,0x7b] + vreduceph zmm6, word ptr [ecx]{1to32}, 123 + +// CHECK: vreduceph zmm6, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x56,0x71,0x7f,0x7b] + vreduceph zmm6, zmmword ptr [ecx + 8128], 123 + +// CHECK: vreduceph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0xdf,0x56,0x72,0x80,0x7b] + vreduceph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vreducesh xmm6, xmm5, xmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x57,0xf4,0x7b] + vreducesh xmm6, xmm5, xmm4, 123 + +// CHECK: vreducesh xmm6, xmm5, xmm4, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x18,0x57,0xf4,0x7b] + vreducesh xmm6, xmm5, xmm4, {sae}, 123 + +// CHECK: vreducesh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x0f,0x57,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vreducesh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vreducesh xmm6, xmm5, word ptr [ecx], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x57,0x31,0x7b] + vreducesh xmm6, xmm5, word ptr [ecx], 123 + +// CHECK: vreducesh xmm6, xmm5, word ptr [ecx + 254], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x57,0x71,0x7f,0x7b] + vreducesh xmm6, xmm5, word ptr [ecx + 254], 123 + +// CHECK: vreducesh xmm6 {k7} 
{z}, xmm5, word ptr [edx - 256], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x8f,0x57,0x72,0x80,0x7b] + vreducesh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123 + +// CHECK: vrndscaleph zmm6, zmm5, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x08,0xf5,0x7b] + vrndscaleph zmm6, zmm5, 123 + +// CHECK: vrndscaleph zmm6, zmm5, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x18,0x08,0xf5,0x7b] + vrndscaleph zmm6, zmm5, {sae}, 123 + +// CHECK: vrndscaleph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x4f,0x08,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscaleph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vrndscaleph zmm6, word ptr [ecx]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x58,0x08,0x31,0x7b] + vrndscaleph zmm6, word ptr [ecx]{1to32}, 123 + +// CHECK: vrndscaleph zmm6, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x48,0x08,0x71,0x7f,0x7b] + vrndscaleph zmm6, zmmword ptr [ecx + 8128], 123 + +// CHECK: vrndscaleph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0xdf,0x08,0x72,0x80,0x7b] + vrndscaleph zmm6 {k7} {z}, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vrndscalesh xmm6, xmm5, xmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x0a,0xf4,0x7b] + vrndscalesh xmm6, xmm5, xmm4, 123 + +// CHECK: vrndscalesh xmm6, xmm5, xmm4, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x18,0x0a,0xf4,0x7b] + vrndscalesh xmm6, xmm5, xmm4, {sae}, 123 + +// CHECK: vrndscalesh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x0f,0x0a,0xb4,0xf4,0x00,0x00,0x00,0x10,0x7b] + vrndscalesh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vrndscalesh xmm6, xmm5, word ptr [ecx], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x0a,0x31,0x7b] + vrndscalesh xmm6, xmm5, word ptr [ecx], 123 + +// CHECK: vrndscalesh xmm6, xmm5, word ptr [ecx + 254], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0x0a,0x71,0x7f,0x7b] + vrndscalesh xmm6, xmm5, word ptr [ecx + 254], 123 + +// CHECK: vrndscalesh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x8f,0x0a,0x72,0x80,0x7b] + vrndscalesh xmm6 {k7} {z}, xmm5, word ptr [edx - 256], 123 + +// CHECK: vrsqrtph zmm6, zmm5 +// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x4e,0xf5] + vrsqrtph zmm6, zmm5 + +// CHECK: vrsqrtph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x7d,0x4f,0x4e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vrsqrtph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vrsqrtph zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7d,0x58,0x4e,0x31] + vrsqrtph zmm6, word ptr [ecx]{1to32} + +// CHECK: vrsqrtph zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x7d,0x48,0x4e,0x71,0x7f] + vrsqrtph zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vrsqrtph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x7d,0xdf,0x4e,0x72,0x80] + vrsqrtph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vrsqrtsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4f,0xf4] + vrsqrtsh xmm6, xmm5, xmm4 + +// CHECK: vrsqrtsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x4f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vrsqrtsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vrsqrtsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4f,0x31] + vrsqrtsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vrsqrtsh xmm6, 
xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x4f,0x71,0x7f] + vrsqrtsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vrsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x4f,0x72,0x80] + vrsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vscalefph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x2c,0xf4] + vscalefph zmm6, zmm5, zmm4 + +// CHECK: vscalefph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x2c,0xf4] + vscalefph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vscalefph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x4f,0x2c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vscalefph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vscalefph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0x58,0x2c,0x31] + vscalefph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vscalefph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf6,0x55,0x48,0x2c,0x71,0x7f] + vscalefph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vscalefph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf6,0x55,0xdf,0x2c,0x72,0x80] + vscalefph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vscalefsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2d,0xf4] + vscalefsh xmm6, xmm5, xmm4 + +// CHECK: vscalefsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf6,0x55,0x18,0x2d,0xf4] + vscalefsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vscalefsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf6,0x55,0x0f,0x2d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vscalefsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vscalefsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2d,0x31] + vscalefsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vscalefsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf6,0x55,0x08,0x2d,0x71,0x7f] + vscalefsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vscalefsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf6,0x55,0x8f,0x2d,0x72,0x80] + vscalefsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vsqrtph zmm6, zmm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x51,0xf5] + vsqrtph zmm6, zmm5 + +// CHECK: vsqrtph zmm6, zmm5, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x51,0xf5] + vsqrtph zmm6, zmm5, {rn-sae} + +// CHECK: vsqrtph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x4f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsqrtph zmm6 {k7}, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsqrtph zmm6, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7c,0x58,0x51,0x31] + vsqrtph zmm6, word ptr [ecx]{1to32} + +// CHECK: vsqrtph zmm6, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x7c,0x48,0x51,0x71,0x7f] + vsqrtph zmm6, zmmword ptr [ecx + 8128] + +// CHECK: vsqrtph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x7c,0xdf,0x51,0x72,0x80] + vsqrtph zmm6 {k7} {z}, word ptr [edx - 256]{1to32} + +// CHECK: vsqrtsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x51,0xf4] + vsqrtsh xmm6, xmm5, xmm4 + +// CHECK: vsqrtsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x51,0xf4] + vsqrtsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vsqrtsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: 
[0x62,0xf5,0x56,0x0f,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsqrtsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vsqrtsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x51,0x31] + vsqrtsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vsqrtsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x51,0x71,0x7f] + vsqrtsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x51,0x72,0x80] + vsqrtsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s index 5c53fc376e1cca..6091599b87d666 100644 --- a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s @@ -1135,3 +1135,359 @@ // CHECK: vcvtw2ph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16} // CHECK: encoding: [0x62,0x65,0x7e,0xbf,0x7d,0x72,0x80] vcvtw2ph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vfpclassph k5, xmm30, 123 +// CHECK: encoding: [0x62,0x93,0x7c,0x08,0x66,0xee,0x7b] + vfpclassph k5, xmm30, 123 + +// CHECK: vfpclassph k5, ymm30, 123 +// CHECK: encoding: [0x62,0x93,0x7c,0x28,0x66,0xee,0x7b] + vfpclassph k5, ymm30, 123 + +// CHECK: vfpclassph k5 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x7c,0x0f,0x66,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vfpclassph k5 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vfpclassph k5, word ptr [r9]{1to8}, 123 +// CHECK: encoding: [0x62,0xd3,0x7c,0x18,0x66,0x29,0x7b] + vfpclassph k5, word ptr [r9]{1to8}, 123 + +// CHECK: vfpclassph k5, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x08,0x66,0x69,0x7f,0x7b] + vfpclassph k5, xmmword ptr [rcx + 2032], 123 + +// CHECK: vfpclassph k5 {k7}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x1f,0x66,0x6a,0x80,0x7b] + vfpclassph k5 {k7}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vfpclassph k5, word ptr [r9]{1to16}, 123 +// CHECK: encoding: [0x62,0xd3,0x7c,0x38,0x66,0x29,0x7b] + vfpclassph k5, word ptr [r9]{1to16}, 123 + +// CHECK: vfpclassph k5, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x28,0x66,0x69,0x7f,0x7b] + vfpclassph k5, ymmword ptr [rcx + 4064], 123 + +// CHECK: vfpclassph k5 {k7}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x7c,0x3f,0x66,0x6a,0x80,0x7b] + vfpclassph k5 {k7}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vgetexpph xmm30, xmm29 +// CHECK: encoding: [0x62,0x06,0x7d,0x08,0x42,0xf5] + vgetexpph xmm30, xmm29 + +// CHECK: vgetexpph ymm30, ymm29 +// CHECK: encoding: [0x62,0x06,0x7d,0x28,0x42,0xf5] + vgetexpph ymm30, ymm29 + +// CHECK: vgetexpph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x7d,0x0f,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexpph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vgetexpph xmm30, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x7d,0x18,0x42,0x31] + vgetexpph xmm30, word ptr [r9]{1to8} + +// CHECK: vgetexpph xmm30, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x7d,0x08,0x42,0x71,0x7f] + vgetexpph xmm30, xmmword ptr [rcx + 2032] + +// CHECK: vgetexpph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x7d,0x9f,0x42,0x72,0x80] + vgetexpph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vgetexpph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0x26,0x7d,0x2f,0x42,0xb4,0xf5,0x00,0x00,0x00,0x10] + vgetexpph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vgetexpph ymm30, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x7d,0x38,0x42,0x31] + vgetexpph ymm30, word ptr [r9]{1to16} + +// CHECK: vgetexpph ymm30, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x7d,0x28,0x42,0x71,0x7f] + vgetexpph ymm30, ymmword ptr [rcx + 4064] + +// CHECK: vgetexpph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x7d,0xbf,0x42,0x72,0x80] + vgetexpph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vgetmantph ymm30, ymm29, 123 +// CHECK: encoding: [0x62,0x03,0x7c,0x28,0x26,0xf5,0x7b] + vgetmantph ymm30, ymm29, 123 + +// CHECK: vgetmantph xmm30, xmm29, 123 +// CHECK: encoding: [0x62,0x03,0x7c,0x08,0x26,0xf5,0x7b] + vgetmantph xmm30, xmm29, 123 + +// CHECK: vgetmantph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0x23,0x7c,0x0f,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vgetmantph xmm30, word ptr [r9]{1to8}, 123 +// CHECK: encoding: [0x62,0x43,0x7c,0x18,0x26,0x31,0x7b] + vgetmantph xmm30, word ptr [r9]{1to8}, 123 + +// CHECK: vgetmantph xmm30, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0x63,0x7c,0x08,0x26,0x71,0x7f,0x7b] + vgetmantph xmm30, xmmword ptr [rcx + 2032], 123 + +// CHECK: vgetmantph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0x63,0x7c,0x9f,0x26,0x72,0x80,0x7b] + vgetmantph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vgetmantph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0x23,0x7c,0x2f,0x26,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vgetmantph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vgetmantph ymm30, word ptr [r9]{1to16}, 123 +// CHECK: encoding: [0x62,0x43,0x7c,0x38,0x26,0x31,0x7b] + vgetmantph ymm30, word ptr [r9]{1to16}, 123 + +// CHECK: vgetmantph ymm30, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0x63,0x7c,0x28,0x26,0x71,0x7f,0x7b] + vgetmantph ymm30, ymmword ptr [rcx + 4064], 123 + +// CHECK: vgetmantph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0x63,0x7c,0xbf,0x26,0x72,0x80,0x7b] + vgetmantph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vrcpph xmm30, xmm29 +// CHECK: encoding: [0x62,0x06,0x7d,0x08,0x4c,0xf5] + vrcpph xmm30, xmm29 + +// CHECK: vrcpph ymm30, ymm29 +// CHECK: encoding: [0x62,0x06,0x7d,0x28,0x4c,0xf5] + vrcpph ymm30, ymm29 + +// CHECK: vrcpph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x7d,0x0f,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcpph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrcpph xmm30, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x7d,0x18,0x4c,0x31] + vrcpph xmm30, word ptr [r9]{1to8} + +// CHECK: vrcpph xmm30, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x7d,0x08,0x4c,0x71,0x7f] + vrcpph xmm30, xmmword ptr [rcx + 2032] + +// CHECK: vrcpph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x7d,0x9f,0x4c,0x72,0x80] + vrcpph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vrcpph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x7d,0x2f,0x4c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrcpph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrcpph ymm30, word ptr [r9]{1to16} +// CHECK: encoding: 
[0x62,0x46,0x7d,0x38,0x4c,0x31] + vrcpph ymm30, word ptr [r9]{1to16} + +// CHECK: vrcpph ymm30, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x7d,0x28,0x4c,0x71,0x7f] + vrcpph ymm30, ymmword ptr [rcx + 4064] + +// CHECK: vrcpph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x7d,0xbf,0x4c,0x72,0x80] + vrcpph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vreduceph ymm30, ymm29, 123 +// CHECK: encoding: [0x62,0x03,0x7c,0x28,0x56,0xf5,0x7b] + vreduceph ymm30, ymm29, 123 + +// CHECK: vreduceph xmm30, xmm29, 123 +// CHECK: encoding: [0x62,0x03,0x7c,0x08,0x56,0xf5,0x7b] + vreduceph xmm30, xmm29, 123 + +// CHECK: vreduceph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0x23,0x7c,0x0f,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreduceph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vreduceph xmm30, word ptr [r9]{1to8}, 123 +// CHECK: encoding: [0x62,0x43,0x7c,0x18,0x56,0x31,0x7b] + vreduceph xmm30, word ptr [r9]{1to8}, 123 + +// CHECK: vreduceph xmm30, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0x63,0x7c,0x08,0x56,0x71,0x7f,0x7b] + vreduceph xmm30, xmmword ptr [rcx + 2032], 123 + +// CHECK: vreduceph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0x63,0x7c,0x9f,0x56,0x72,0x80,0x7b] + vreduceph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vreduceph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0x23,0x7c,0x2f,0x56,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vreduceph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vreduceph ymm30, word ptr [r9]{1to16}, 123 +// CHECK: encoding: [0x62,0x43,0x7c,0x38,0x56,0x31,0x7b] + vreduceph ymm30, word ptr [r9]{1to16}, 123 + +// CHECK: vreduceph ymm30, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0x63,0x7c,0x28,0x56,0x71,0x7f,0x7b] + vreduceph ymm30, ymmword ptr [rcx + 4064], 123 + +// CHECK: vreduceph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0x63,0x7c,0xbf,0x56,0x72,0x80,0x7b] + vreduceph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vrndscaleph ymm30, ymm29, 123 +// CHECK: encoding: [0x62,0x03,0x7c,0x28,0x08,0xf5,0x7b] + vrndscaleph ymm30, ymm29, 123 + +// CHECK: vrndscaleph xmm30, xmm29, 123 +// CHECK: encoding: [0x62,0x03,0x7c,0x08,0x08,0xf5,0x7b] + vrndscaleph xmm30, xmm29, 123 + +// CHECK: vrndscaleph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0x23,0x7c,0x0f,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscaleph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vrndscaleph xmm30, word ptr [r9]{1to8}, 123 +// CHECK: encoding: [0x62,0x43,0x7c,0x18,0x08,0x31,0x7b] + vrndscaleph xmm30, word ptr [r9]{1to8}, 123 + +// CHECK: vrndscaleph xmm30, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0x63,0x7c,0x08,0x08,0x71,0x7f,0x7b] + vrndscaleph xmm30, xmmword ptr [rcx + 2032], 123 + +// CHECK: vrndscaleph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0x63,0x7c,0x9f,0x08,0x72,0x80,0x7b] + vrndscaleph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vrndscaleph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0x23,0x7c,0x2f,0x08,0xb4,0xf5,0x00,0x00,0x00,0x10,0x7b] + vrndscaleph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vrndscaleph ymm30, word ptr [r9]{1to16}, 123 +// CHECK: encoding: [0x62,0x43,0x7c,0x38,0x08,0x31,0x7b] + vrndscaleph 
ymm30, word ptr [r9]{1to16}, 123 + +// CHECK: vrndscaleph ymm30, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0x63,0x7c,0x28,0x08,0x71,0x7f,0x7b] + vrndscaleph ymm30, ymmword ptr [rcx + 4064], 123 + +// CHECK: vrndscaleph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0x63,0x7c,0xbf,0x08,0x72,0x80,0x7b] + vrndscaleph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vrsqrtph xmm30, xmm29 +// CHECK: encoding: [0x62,0x06,0x7d,0x08,0x4e,0xf5] + vrsqrtph xmm30, xmm29 + +// CHECK: vrsqrtph ymm30, ymm29 +// CHECK: encoding: [0x62,0x06,0x7d,0x28,0x4e,0xf5] + vrsqrtph ymm30, ymm29 + +// CHECK: vrsqrtph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x7d,0x0f,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrsqrtph xmm30, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x7d,0x18,0x4e,0x31] + vrsqrtph xmm30, word ptr [r9]{1to8} + +// CHECK: vrsqrtph xmm30, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x7d,0x08,0x4e,0x71,0x7f] + vrsqrtph xmm30, xmmword ptr [rcx + 2032] + +// CHECK: vrsqrtph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x7d,0x9f,0x4e,0x72,0x80] + vrsqrtph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vrsqrtph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x7d,0x2f,0x4e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vrsqrtph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vrsqrtph ymm30, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x7d,0x38,0x4e,0x31] + vrsqrtph ymm30, word ptr [r9]{1to16} + +// CHECK: vrsqrtph ymm30, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x7d,0x28,0x4e,0x71,0x7f] + vrsqrtph ymm30, ymmword ptr [rcx + 4064] + +// CHECK: vrsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x7d,0xbf,0x4e,0x72,0x80] + vrsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16} + +// CHECK: vscalefph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x06,0x15,0x20,0x2c,0xf4] + vscalefph ymm30, ymm29, ymm28 + +// CHECK: vscalefph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x06,0x15,0x00,0x2c,0xf4] + vscalefph xmm30, xmm29, xmm28 + +// CHECK: vscalefph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x27,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vscalefph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x46,0x15,0x30,0x2c,0x31] + vscalefph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vscalefph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x66,0x15,0x20,0x2c,0x71,0x7f] + vscalefph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vscalefph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x66,0x15,0xb7,0x2c,0x72,0x80] + vscalefph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vscalefph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x26,0x15,0x07,0x2c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vscalefph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vscalefph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x46,0x15,0x10,0x2c,0x31] + vscalefph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vscalefph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x66,0x15,0x00,0x2c,0x71,0x7f] + vscalefph xmm30, xmm29, xmmword ptr 
[rcx + 2032] + +// CHECK: vscalefph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x66,0x15,0x97,0x2c,0x72,0x80] + vscalefph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vsqrtph xmm30, xmm29 +// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x51,0xf5] + vsqrtph xmm30, xmm29 + +// CHECK: vsqrtph ymm30, ymm29 +// CHECK: encoding: [0x62,0x05,0x7c,0x28,0x51,0xf5] + vsqrtph ymm30, ymm29 + +// CHECK: vsqrtph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x7c,0x0f,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtph xmm30 {k7}, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsqrtph xmm30, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x7c,0x18,0x51,0x31] + vsqrtph xmm30, word ptr [r9]{1to8} + +// CHECK: vsqrtph xmm30, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x51,0x71,0x7f] + vsqrtph xmm30, xmmword ptr [rcx + 2032] + +// CHECK: vsqrtph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x7c,0x9f,0x51,0x72,0x80] + vsqrtph xmm30 {k7} {z}, word ptr [rdx - 256]{1to8} + +// CHECK: vsqrtph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x7c,0x2f,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsqrtph ymm30 {k7}, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsqrtph ymm30, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x7c,0x38,0x51,0x31] + vsqrtph ymm30, word ptr [r9]{1to16} + +// CHECK: vsqrtph ymm30, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x7c,0x28,0x51,0x71,0x7f] + vsqrtph ymm30, ymmword ptr [rcx + 4064] + +// CHECK: vsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x7c,0xbf,0x51,0x72,0x80] + vsqrtph ymm30 {k7} {z}, word ptr [rdx - 256]{1to16}
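
The assembler coverage above only pins down mnemonics, operand forms, and byte encodings. As a hedged illustration of how a couple of these packed-FP16 forms can be reached from C, the sketch below assumes the usual <immintrin.h> AVX512-FP16 intrinsic names (_mm512_set1_ph, _mm512_sqrt_ph, _mm512_rcp_ph, _mm512_storeu_ph); those names are an assumption here, not something shown in this hunk, and building and running it needs a compiler and CPU with AVX512-FP16 support (e.g. clang -O2 -mavx512fp16 on a machine that has the feature).

/* Minimal sketch, not part of the patch: exercises packed-FP16 sqrt and
 * reciprocal from C.  The intrinsic names used here are assumed to be the
 * standard <immintrin.h> ones for AVX512-FP16. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* 32 half-precision lanes, all set to 4.0. */
  __m512h v = _mm512_set1_ph((_Float16)4.0f);

  /* Expected to lower to the vsqrtph / vrcpph instruction forms whose
   * encodings are checked in the tests above. */
  __m512h s = _mm512_sqrt_ph(v);
  __m512h r = _mm512_rcp_ph(v);

  _Float16 out_sqrt[32], out_rcp[32];
  _mm512_storeu_ph(out_sqrt, s);
  _mm512_storeu_ph(out_rcp, r);

  printf("sqrt(4.0) = %f, rcp(4.0) ~= %f\n",
         (double)out_sqrt[0], (double)out_rcp[0]);
  return 0;
}

As for the .s files themselves, MC tests of this kind are normally driven by a RUN line of the form `llvm-mc -triple <triple> --show-encoding %s | FileCheck %s`; the exact RUN lines live in the unmodified parts of each file and therefore do not appear in this diff.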