Skip to content

Commit

Permalink
[X86][FP16] Change the order of the operands in complex FMA intrinsics to allow swap between the mul operands.
Browse files Browse the repository at this point in the history

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D109658
  • Loading branch information
phoebewang committed Sep 23, 2021
1 parent 47cc166 commit ebec077
Show file tree
Hide file tree
Showing 14 changed files with 392 additions and 212 deletions.
76 changes: 38 additions & 38 deletions clang/lib/Headers/avx512fp16intrin.h
Expand Up @@ -2934,98 +2934,98 @@ _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
/*
 * _mm_fcmadd_sch(__A, __B, __C): casts the three __m128h arguments to __v4sf
 * and forwards them to __builtin_ia32_vfcmaddcsh_mask with mask (__mmask8)-1
 * and _MM_FROUND_CUR_DIRECTION; the result is cast back to __m128h.
 *
 * NOTE(review): this is a rendered diff, so the return statement appears
 * twice. The two variants differ only in operand order; presumably the first
 * is the removed line and the second the added one -- confirm against the
 * commit.
 */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
__m128h __B,
__m128h __C) {
/* Variant 1: operand order (__C, __A, __B). */
return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A,
(__v4sf)__B, (__mmask8)-1,
/* Variant 2: operand order (__A, __B, __C). */
return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C, (__mmask8)-1,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm_mask_fcmadd_sch(__A, __U, __B, __C): calls
 * __builtin_ia32_vfcmaddcsh_mask with mask __U and _MM_FROUND_CUR_DIRECTION,
 * then passes __U, that result and (__v4sf)__A to
 * __builtin_ia32_selectps_128 (presumably a per-mask-bit choice between the
 * result and __A -- confirm against the builtin's definition).
 *
 * NOTE(review): rendered diff; the first line of the inner builtin call
 * appears twice with different operand order. Presumably the first is the
 * removed line and the second the added one -- confirm against the commit.
 */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
return (__m128h)__builtin_ia32_selectps_128(
__U,
/* Variant 1: operand order (__C, __A, __B). */
__builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B,
/* Variant 2: operand order (__A, __B, __C). */
__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
(__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
(__v4sf)__A);
}

/*
 * _mm_maskz_fcmadd_sch(__U, __A, __B, __C): forwards the three vector
 * arguments (cast to __v4sf) to __builtin_ia32_vfcmaddcsh_maskz with mask
 * (__mmask8)__U and _MM_FROUND_CUR_DIRECTION; the result is cast to __m128h.
 *
 * NOTE(review): rendered diff; the return statement appears twice with
 * different operand order. Presumably the first is the removed line and the
 * second the added one -- confirm against the commit.
 */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
/* Variant 1: operand order (__C, __A, __B). */
return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__C, (__v4sf)__A,
(__v4sf)__B, (__mmask8)__U,
/* Variant 2: operand order (__A, __B, __C). */
return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C, (__mmask8)__U,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm_mask3_fcmadd_sch(__A, __B, __C, __U): calls
 * __builtin_ia32_vfcmaddcsh_mask with mask __U and _MM_FROUND_CUR_DIRECTION,
 * then passes (__m128)__C and that result to _mm_move_ss; the final value is
 * cast to __m128h.
 *
 * NOTE(review): rendered diff; the operand line of the builtin call appears
 * twice. Presumably the first is the removed line and the second the added
 * one -- confirm against the commit.
 */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
return (__m128h)_mm_move_ss((__m128)__C,
(__m128)__builtin_ia32_vfcmaddcsh_mask(
/* Variant 1: operand order (__C, __A, __B). */
(__v4sf)__C, (__v4sf)__A, (__v4sf)__B, __U,
/* Variant 2: operand order (__A, __B, __C). */
(__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U,
_MM_FROUND_CUR_DIRECTION));
}

/*
 * _mm_fcmadd_round_sch(A, B, C, R): expands to a call of
 * __builtin_ia32_vfcmaddcsh_mask on A, B, C (each cast through __m128h to
 * __v4sf) with mask (__mmask8)-1 and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm_fcmadd_round_sch(A, B, C, R) \
((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
(__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)-1, (int)(R)))

/*
 * _mm_mask_fcmadd_round_sch(A, U, B, C, R): expands to
 * __builtin_ia32_selectps_128 applied to the mask (__mmask8)(U & 1), the
 * result of __builtin_ia32_vfcmaddcsh_mask (called with mask (__mmask8)(U)
 * and (int)(R)), and A cast to __v4sf.
 *
 * NOTE(review): U is not parenthesized inside (U & 1); an argument such as
 * `a | b` would expand to `a | b & 1`, which C parses as `a | (b & 1)`.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
((__m128h)__builtin_ia32_selectps_128( \
(__mmask8)(U & 1), \
__builtin_ia32_vfcmaddcsh_mask( \
(__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)), \
(__v4sf)(__m128h)(A)))

/*
 * _mm_maskz_fcmadd_round_sch(U, A, B, C, R): expands to a call of
 * __builtin_ia32_vfcmaddcsh_maskz on A, B, C (each cast through __m128h to
 * __v4sf) with mask (__mmask8)(U) and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
(__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

/*
 * _mm_mask3_fcmadd_round_sch(A, B, C, U, R): expands to _mm_move_ss applied
 * to (__m128)(C) and the result of __builtin_ia32_vfcmaddcsh_mask; the final
 * value is cast to __m128h. Unlike the sibling *_round_sch macros, the vector
 * arguments are cast straight to __v4sf (no intermediate __m128h cast) and
 * U and R are passed through without casts.
 *
 * NOTE(review): rendered diff; the builtin's argument line appears twice,
 * first as (C, A, B) and then as (A, B, C). Presumably the first is the
 * removed line and the second the added one -- confirm against the commit.
 */
#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
((__m128h)_mm_move_ss((__m128)(C), \
(__m128)__builtin_ia32_vfcmaddcsh_mask( \
(__v4sf)(C), (__v4sf)(A), (__v4sf)(B), (U), (R))))
(__v4sf)(A), (__v4sf)(B), (__v4sf)(C), (U), (R))))

/*
 * _mm_fmadd_sch(__A, __B, __C): casts the three __m128h arguments to __v4sf
 * and forwards them to __builtin_ia32_vfmaddcsh_mask with mask (__mmask8)-1
 * and _MM_FROUND_CUR_DIRECTION; the result is cast back to __m128h.
 *
 * NOTE(review): rendered diff; the return statement appears twice with
 * different operand order. Presumably the first is the removed line and the
 * second the added one -- confirm against the commit.
 */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
__m128h __B,
__m128h __C) {
/* Variant 1: operand order (__C, __A, __B). */
return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A,
(__v4sf)__B, (__mmask8)-1,
/* Variant 2: operand order (__A, __B, __C). */
return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C, (__mmask8)-1,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm_mask_fmadd_sch(__A, __U, __B, __C): calls
 * __builtin_ia32_vfmaddcsh_mask with mask __U and _MM_FROUND_CUR_DIRECTION,
 * then passes __U, that result and (__v4sf)__A to
 * __builtin_ia32_selectps_128 (presumably a per-mask-bit choice between the
 * result and __A -- confirm against the builtin's definition).
 *
 * NOTE(review): rendered diff; the first line of the inner builtin call
 * appears twice with different operand order. Presumably the first is the
 * removed line and the second the added one -- confirm against the commit.
 */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
return (__m128h)__builtin_ia32_selectps_128(
__U,
/* Variant 1: operand order (__C, __A, __B). */
__builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B,
/* Variant 2: operand order (__A, __B, __C). */
__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
(__mmask8)__U, _MM_FROUND_CUR_DIRECTION),
(__v4sf)__A);
}

/*
 * _mm_maskz_fmadd_sch(__U, __A, __B, __C): forwards the three vector
 * arguments (cast to __v4sf) to __builtin_ia32_vfmaddcsh_maskz with mask
 * (__mmask8)__U and _MM_FROUND_CUR_DIRECTION; the result is cast to __m128h.
 *
 * NOTE(review): rendered diff; the return statement appears twice with
 * different operand order. Presumably the first is the removed line and the
 * second the added one -- confirm against the commit.
 */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
/* Variant 1: operand order (__C, __A, __B). */
return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__C, (__v4sf)__A,
(__v4sf)__B, (__mmask8)__U,
/* Variant 2: operand order (__A, __B, __C). */
return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C, (__mmask8)__U,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm_fmadd_round_sch(A, B, C, R): expands to a call of
 * __builtin_ia32_vfmaddcsh_mask on A, B, C (each cast through __m128h to
 * __v4sf) with mask (__mmask8)-1 and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm_fmadd_round_sch(A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddcsh_mask( \
(__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)-1, (int)(R)))

/*
 * _mm_mask_fmadd_round_sch(A, U, B, C, R): expands to
 * __builtin_ia32_selectps_128 applied to the mask (__mmask8)(U & 1), the
 * result of __builtin_ia32_vfmaddcsh_mask (called with mask (__mmask8)(U)
 * and (int)(R)), and A cast to __v4sf.
 *
 * NOTE(review): U is not parenthesized inside (U & 1); an argument such as
 * `a | b` would expand to `a | b & 1`, which C parses as `a | (b & 1)`.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
((__m128h)__builtin_ia32_selectps_128( \
(__mmask8)(U & 1), \
__builtin_ia32_vfmaddcsh_mask( \
(__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)), \
(__v4sf)(__m128h)(A)))

/*
 * _mm_maskz_fmadd_round_sch(U, A, B, C, R): expands to a call of
 * __builtin_ia32_vfmaddcsh_maskz on A, B, C (each cast through __m128h to
 * __v4sf) with mask (__mmask8)(U) and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
(__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
(__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
Expand Down Expand Up @@ -3177,111 +3177,111 @@ _mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
/*
 * _mm512_fcmadd_pch(__A, __B, __C): casts the three __m512h arguments to
 * __v16sf and forwards them to __builtin_ia32_vfcmaddcph512_mask with mask
 * (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION; the result is cast to __m512h.
 *
 * NOTE(review): rendered diff; the return statement appears twice with
 * different operand order. Presumably the first is the removed line and the
 * second the added one -- confirm against the commit.
 */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
__m512h __B,
__m512h __C) {
/* Variant 1: operand order (__C, __A, __B). */
return (__m512h)__builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
(__v16sf)__B, (__mmask16)-1,
/* Variant 2: operand order (__A, __B, __C). */
return (__m512h)__builtin_ia32_vfcmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
(__v16sf)__C, (__mmask16)-1,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm512_mask_fcmadd_pch(__A, __U, __B, __C): calls
 * __builtin_ia32_vfcmaddcph512_mask with mask (__mmask16)__U and
 * _MM_FROUND_CUR_DIRECTION, then passes __U, that result and (__v16sf)__A to
 * __builtin_ia32_selectps_512 (presumably a per-mask-bit choice between the
 * result and __A -- confirm against the builtin's definition).
 *
 * NOTE(review): rendered diff; the first two lines of the inner builtin call
 * appear twice with different operand order. Presumably the first pair is
 * removed and the second added -- confirm against the commit.
 */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
return (__m512h)__builtin_ia32_selectps_512(
__U,
/* Variant 1: operand order (__C, __A, __B). */
__builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
(__v16sf)__B, (__mmask16)__U,
/* Variant 2: operand order (__A, __B, __C). */
__builtin_ia32_vfcmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
(__v16sf)__C, (__mmask16)__U,
_MM_FROUND_CUR_DIRECTION),
(__v16sf)__A);
}

/*
 * _mm512_mask3_fcmadd_pch(__A, __B, __C, __U): forwards the three vector
 * arguments (cast to __v16sf) directly to __builtin_ia32_vfcmaddcph512_mask
 * with mask (__mmask16)__U and _MM_FROUND_CUR_DIRECTION; no select builtin
 * is applied here. The result is cast to __m512h.
 *
 * NOTE(review): rendered diff; the operand line appears twice. Presumably
 * the first is the removed line and the second the added one -- confirm
 * against the commit.
 */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
/* Variant 1: operand order (__C, __A, __B). */
(__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U,
/* Variant 2: operand order (__A, __B, __C). */
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm512_maskz_fcmadd_pch(__U, __A, __B, __C): forwards the three vector
 * arguments (cast to __v16sf) to __builtin_ia32_vfcmaddcph512_maskz with mask
 * (__mmask16)__U and _MM_FROUND_CUR_DIRECTION; the result is cast to __m512h.
 *
 * NOTE(review): rendered diff; the operand line appears twice. Presumably
 * the first is the removed line and the second the added one -- confirm
 * against the commit.
 */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
/* Variant 1: operand order (__C, __A, __B). */
(__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U,
/* Variant 2: operand order (__A, __B, __C). */
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm512_fcmadd_round_pch(A, B, C, R): expands to a call of
 * __builtin_ia32_vfcmaddcph512_mask on A, B, C (each cast through __m512h to
 * __v16sf) with mask (__mmask16)-1 and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm512_fcmadd_round_pch(A, B, C, R) \
((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
(__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)-1, (int)(R)))

/*
 * _mm512_mask_fcmadd_round_pch(A, U, B, C, R): expands to
 * __builtin_ia32_selectps_512 applied to the mask (__mmask16)(U), the result
 * of __builtin_ia32_vfcmaddcph512_mask (called with mask (__mmask16)(U) and
 * (int)(R)), and A cast to __v16sf.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
((__m512h)__builtin_ia32_selectps_512( \
(__mmask16)(U), \
__builtin_ia32_vfcmaddcph512_mask( \
(__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)), \
(__v16sf)(__m512h)(A)))

/*
 * _mm512_mask3_fcmadd_round_pch(A, B, C, U, R): expands to a direct call of
 * __builtin_ia32_vfcmaddcph512_mask on A, B, C (each cast through __m512h to
 * __v16sf) with mask (__mmask16)(U) and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
(__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))

/*
 * _mm512_maskz_fcmadd_round_pch(U, A, B, C, R): expands to a call of
 * __builtin_ia32_vfcmaddcph512_maskz on A, B, C (each cast through __m512h
 * to __v16sf) with mask (__mmask16)(U) and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
(__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))

/*
 * _mm512_fmadd_pch(__A, __B, __C): casts the three __m512h arguments to
 * __v16sf and forwards them to __builtin_ia32_vfmaddcph512_mask with mask
 * (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION; the result is cast to __m512h.
 *
 * NOTE(review): rendered diff; the return statement appears twice with
 * different operand order. Presumably the first is the removed line and the
 * second the added one -- confirm against the commit.
 */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
__m512h __B,
__m512h __C) {
/* Variant 1: operand order (__C, __A, __B). */
return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
(__v16sf)__B, (__mmask16)-1,
/* Variant 2: operand order (__A, __B, __C). */
return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
(__v16sf)__C, (__mmask16)-1,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm512_mask_fmadd_pch(__A, __U, __B, __C): calls
 * __builtin_ia32_vfmaddcph512_mask with mask (__mmask16)__U and
 * _MM_FROUND_CUR_DIRECTION, then passes __U, that result and (__v16sf)__A to
 * __builtin_ia32_selectps_512 (presumably a per-mask-bit choice between the
 * result and __A -- confirm against the builtin's definition).
 *
 * NOTE(review): rendered diff; the first line of the inner builtin call
 * appears twice with different operand order. Presumably the first is the
 * removed line and the second the added one -- confirm against the commit.
 */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
return (__m512h)__builtin_ia32_selectps_512(
__U,
/* Variant 1: operand order (__C, __A, __B). */
__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A, (__v16sf)__B,
/* Variant 2: operand order (__A, __B, __C). */
__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C,
(__mmask16)__U,
_MM_FROUND_CUR_DIRECTION),
(__v16sf)__A);
}

/*
 * _mm512_mask3_fmadd_pch(__A, __B, __C, __U): forwards the three vector
 * arguments (cast to __v16sf) directly to __builtin_ia32_vfmaddcph512_mask
 * with mask (__mmask16)__U and _MM_FROUND_CUR_DIRECTION; no select builtin
 * is applied here. The result is cast to __m512h.
 *
 * NOTE(review): rendered diff; the return statement appears twice with
 * different operand order. Presumably the first is the removed line and the
 * second the added one -- confirm against the commit.
 */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
/* Variant 1: operand order (__C, __A, __B). */
return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A,
(__v16sf)__B, (__mmask16)__U,
/* Variant 2: operand order (__A, __B, __C). */
return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
(__v16sf)__C, (__mmask16)__U,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm512_maskz_fmadd_pch(__U, __A, __B, __C): forwards the three vector
 * arguments (cast to __v16sf) to __builtin_ia32_vfmaddcph512_maskz with mask
 * (__mmask16)__U and _MM_FROUND_CUR_DIRECTION; the result is cast to __m512h.
 *
 * NOTE(review): rendered diff; the operand line appears twice. Presumably
 * the first is the removed line and the second the added one -- confirm
 * against the commit.
 */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
/* Variant 1: operand order (__C, __A, __B). */
(__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U,
/* Variant 2: operand order (__A, __B, __C). */
(__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
_MM_FROUND_CUR_DIRECTION);
}

/*
 * _mm512_fmadd_round_pch(A, B, C, R): expands to a call of
 * __builtin_ia32_vfmaddcph512_mask on A, B, C (each cast through __m512h to
 * __v16sf) with mask (__mmask16)-1 and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm512_fmadd_round_pch(A, B, C, R) \
((__m512h)__builtin_ia32_vfmaddcph512_mask( \
(__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)-1, (int)(R)))

/*
 * _mm512_mask_fmadd_round_pch(A, U, B, C, R): expands to
 * __builtin_ia32_selectps_512 applied to the mask (__mmask16)(U), the result
 * of __builtin_ia32_vfmaddcph512_mask (called with mask (__mmask16)(U) and
 * (int)(R)), and A cast to __v16sf.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
((__m512h)__builtin_ia32_selectps_512( \
(__mmask16)(U), \
__builtin_ia32_vfmaddcph512_mask( \
(__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)), \
(__v16sf)(__m512h)(A)))

/*
 * _mm512_mask3_fmadd_round_pch(A, B, C, U, R): expands to a direct call of
 * __builtin_ia32_vfmaddcph512_mask on A, B, C (each cast through __m512h to
 * __v16sf) with mask (__mmask16)(U) and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
((__m512h)__builtin_ia32_vfmaddcph512_mask( \
(__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))

/*
 * _mm512_maskz_fmadd_round_pch(U, A, B, C, R): expands to a call of
 * __builtin_ia32_vfmaddcph512_maskz on A, B, C (each cast through __m512h to
 * __v16sf) with mask (__mmask16)(U) and (int)(R) as the last argument.
 *
 * NOTE(review): rendered diff; the operand line appears twice, first as
 * (C, A, B) and then as (A, B, C). Presumably the first is the removed line
 * and the second the added one -- confirm against the commit.
 */
#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
(__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
(__mmask16)(U), (int)(R)))

static __inline__ _Float16 __DEFAULT_FN_ATTRS512
Expand Down

0 comments on commit ebec077

Please sign in to comment.