| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,382 @@ | ||
| /*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------=== | ||
| * | ||
| * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| * See https://llvm.org/LICENSE.txt for license information. | ||
| * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| * | ||
| *===------------------------------------------------------------------------=== | ||
| */ | ||
| #ifndef __IMMINTRIN_H | ||
| #error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead." | ||
| #endif // __IMMINTRIN_H | ||
|
|
||
| #ifndef __AMX_AVX512INTRIN_H | ||
| #define __AMX_AVX512INTRIN_H | ||
| #ifdef __x86_64__ | ||
|
|
||
| #define __DEFAULT_FN_ATTRS_AVX512 \ | ||
| __attribute__((__always_inline__, __nodebug__, \ | ||
| __target__("amx-avx512,avx10.2-512"))) | ||
|
|
||
| /// Moves a row from a tile register to a zmm destination register, converting | ||
| /// the int32 source elements to fp32. The row of the tile is selected by a | ||
| /// 32b GPR. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// \code | ||
| /// __m512 _tile_cvtrowd2ps(__tile tsrc, unsigned int row); | ||
| /// \endcode | ||
| /// | ||
| /// \code{.operation} | ||
| /// VL := 512 | ||
| /// VL_bytes := VL >> 3 | ||
| /// row_index := row & 0xffff | ||
| /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes | ||
| /// FOR i := 0 TO (VL_bytes / 4) - 1 | ||
| /// IF i + row_chunk / 4 >= tsrc.colsb / 4 | ||
| /// dst.dword[i] := 0 | ||
| /// ELSE | ||
| /// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE) | ||
| /// FI | ||
| /// ENDFOR | ||
| /// dst[MAX_VL-1:VL] := 0 | ||
| /// zero_tileconfig_start() | ||
| /// \endcode | ||
| /// | ||
| /// This intrinsic corresponds to the \c TCVTROWD2PS instruction. | ||
| /// | ||
| /// \param tsrc | ||
| /// The source tile. Max size is 1024 Bytes. | ||
| /// \param row | ||
| /// Bits [15:0] select the row of the source tile; bits [31:16] select the | ||
| /// 64-byte chunk within that row. | ||
| /// \returns | ||
| /// The 16 converted fp32 elements. Elements that fall past the end of the | ||
| /// row (tsrc.colsb) are zero. | ||
| #define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row) | ||
|
|
||
| /// Moves a row from a tile register to a zmm destination register, converting | ||
| /// the fp32 source elements to bf16. It places the resulting bf16 elements | ||
| /// in the high 16 bits within each dword. The row of the tile is selected | ||
| /// by a 32b GPR. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// \code | ||
| /// __m512bh _tile_cvtrowps2pbf16h(__tile tsrc, unsigned int row); | ||
| /// \endcode | ||
| /// | ||
| /// \code{.operation} | ||
| /// VL := 512 | ||
| /// VL_bytes := VL >> 3 | ||
| /// row_index := row & 0xffff | ||
| /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes | ||
| /// FOR i := 0 TO (VL_bytes / 4) - 1 | ||
| /// IF i + row_chunk / 4 >= tsrc.colsb / 4 | ||
| /// dst.dword[i] := 0 | ||
| /// ELSE | ||
| /// dst.word[2*i+0] := 0 | ||
| /// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) | ||
| /// FI | ||
| /// ENDFOR | ||
| /// dst[MAX_VL-1:VL] := 0 | ||
| /// zero_tileconfig_start() | ||
| /// \endcode | ||
| /// | ||
| /// This intrinsic corresponds to the \c TCVTROWPS2PBF16H instruction. | ||
| /// | ||
| /// \param tsrc | ||
| /// The source tile. Max size is 1024 Bytes. | ||
| /// \param row | ||
| /// Bits [15:0] select the row of the source tile; bits [31:16] select the | ||
| /// 64-byte chunk within that row. | ||
| /// \returns | ||
| /// A 512-bit bf16 vector: each dword holds one converted value in its high | ||
| /// 16 bits and zero in its low 16 bits. Dwords past the end of the row | ||
| /// (tsrc.colsb) are zero. | ||
| #define _tile_cvtrowps2pbf16h(tsrc, row) \ | ||
| __builtin_ia32_tcvtrowps2pbf16h(tsrc, row) | ||
|
|
||
| /// Moves a row from a tile register to a zmm destination register, converting | ||
| /// the fp32 source elements to bf16. It places the resulting bf16 elements | ||
| /// in the low 16 bits within each dword. The row of the tile is selected | ||
| /// by a 32b GPR. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// \code | ||
| /// __m512bh _tile_cvtrowps2pbf16l(__tile tsrc, unsigned int row); | ||
| /// \endcode | ||
| /// | ||
| /// \code{.operation} | ||
| /// VL := 512 | ||
| /// VL_bytes := VL >> 3 | ||
| /// row_index := row & 0xffff | ||
| /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes | ||
| /// FOR i := 0 TO (VL_bytes / 4) - 1 | ||
| /// IF i + row_chunk / 4 >= tsrc.colsb / 4 | ||
| /// dst.dword[i] := 0 | ||
| /// ELSE | ||
| /// dst.word[2*i+1] := 0 | ||
| /// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) | ||
| /// FI | ||
| /// ENDFOR | ||
| /// dst[MAX_VL-1:VL] := 0 | ||
| /// zero_tileconfig_start() | ||
| /// \endcode | ||
| /// | ||
| /// This intrinsic corresponds to the \c TCVTROWPS2PBF16L instruction. | ||
| /// | ||
| /// \param tsrc | ||
| /// The source tile. Max size is 1024 Bytes. | ||
| /// \param row | ||
| /// Bits [15:0] select the row of the source tile; bits [31:16] select the | ||
| /// 64-byte chunk within that row. | ||
| /// \returns | ||
| /// A 512-bit bf16 vector: each dword holds one converted value in its low | ||
| /// 16 bits and zero in its high 16 bits. Dwords past the end of the row | ||
| /// (tsrc.colsb) are zero. | ||
| #define _tile_cvtrowps2pbf16l(tsrc, row) \ | ||
| __builtin_ia32_tcvtrowps2pbf16l(tsrc, row) | ||
|
|
||
| /// Moves a row from a tile register to a zmm destination register, converting | ||
| /// the fp32 source elements to fp16. It places the resulting fp16 elements | ||
| /// in the high 16 bits within each dword. The row of the tile is selected | ||
| /// by a 32b GPR. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// \code | ||
| /// __m512h _tile_cvtrowps2phh(__tile tsrc, unsigned int row); | ||
| /// \endcode | ||
| /// | ||
| /// \code{.operation} | ||
| /// VL := 512 | ||
| /// VL_bytes := VL >> 3 | ||
| /// row_index := row & 0xffff | ||
| /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes | ||
| /// FOR i := 0 TO (VL_bytes / 4) - 1 | ||
| /// IF i + row_chunk / 4 >= tsrc.colsb / 4 | ||
| /// dst.dword[i] := 0 | ||
| /// ELSE | ||
| /// dst.word[2*i+0] := 0 | ||
| /// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) | ||
| /// FI | ||
| /// ENDFOR | ||
| /// dst[MAX_VL-1:VL] := 0 | ||
| /// zero_tileconfig_start() | ||
| /// \endcode | ||
| /// | ||
| /// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction. | ||
| /// | ||
| /// \param tsrc | ||
| /// The source tile. Max size is 1024 Bytes. | ||
| /// \param row | ||
| /// Bits [15:0] select the row of the source tile; bits [31:16] select the | ||
| /// 64-byte chunk within that row. | ||
| /// \returns | ||
| /// A 512-bit fp16 vector: each dword holds one converted value in its high | ||
| /// 16 bits and zero in its low 16 bits. Dwords past the end of the row | ||
| /// (tsrc.colsb) are zero. | ||
| #define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row) | ||
|
|
||
| /// Moves a row from a tile register to a zmm destination register, converting | ||
| /// the fp32 source elements to fp16. It places the resulting fp16 elements | ||
| /// in the low 16 bits within each dword. The row of the tile is selected | ||
| /// by a 32b GPR. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// \code | ||
| /// __m512h _tile_cvtrowps2phl(__tile tsrc, unsigned int row); | ||
| /// \endcode | ||
| /// | ||
| /// \code{.operation} | ||
| /// VL := 512 | ||
| /// VL_bytes := VL >> 3 | ||
| /// row_index := row & 0xffff | ||
| /// row_chunk := ((row >> 16) & 0xffff) * VL_bytes | ||
| /// FOR i := 0 TO (VL_bytes / 4) - 1 | ||
| /// IF i + row_chunk / 4 >= tsrc.colsb / 4 | ||
| /// dst.dword[i] := 0 | ||
| /// ELSE | ||
| /// dst.word[2*i+1] := 0 | ||
| /// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) | ||
| /// FI | ||
| /// ENDFOR | ||
| /// dst[MAX_VL-1:VL] := 0 | ||
| /// zero_tileconfig_start() | ||
| /// \endcode | ||
| /// | ||
| /// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction. | ||
| /// | ||
| /// \param tsrc | ||
| /// The source tile. Max size is 1024 Bytes. | ||
| /// \param row | ||
| /// Bits [15:0] select the row of the source tile; bits [31:16] select the | ||
| /// 64-byte chunk within that row. | ||
| /// \returns | ||
| /// A 512-bit fp16 vector: each dword holds one converted value in its low | ||
| /// 16 bits and zero in its high 16 bits. Dwords past the end of the row | ||
| /// (tsrc.colsb) are zero. | ||
| #define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row) | ||
|
|
||
| /// Move one row of tile data to a v16i32 destination without conversion. | ||
| /// The row of the tile is selected by a 32b GPR. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// \code | ||
| /// __m512i _tile_movrow(__tile a, unsigned b); | ||
| /// \endcode | ||
| /// | ||
| /// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction. | ||
| /// | ||
| /// \param a | ||
| /// The 1st source tile. Max size is 1024 Bytes. | ||
| /// \param b | ||
| /// The 2nd source r32. Size is 4 Bytes. Bits [15:0] select the row and | ||
| /// bits [31:16] select the 64-byte chunk within that row. | ||
| /// \returns | ||
| /// The destination v16i32 data. Size is 64 Bytes. | ||
| /// | ||
| /// \code{.operation} | ||
| /// VL := 512 | ||
| /// VL_bytes := VL>>3 | ||
| /// row_index := b&0xffff | ||
| /// row_chunk := ((b>>16)&0xffff) * VL_bytes | ||
| /// FOR i := 0 TO (VL_bytes-1) | ||
| /// IF (row_chunk + i >= a.colsb) | ||
| /// dst.byte[i] := 0 | ||
| /// ELSE | ||
| /// dst.byte[i] := a.row[row_index].byte[row_chunk+i] | ||
| /// FI | ||
| /// ENDFOR | ||
| /// \endcode | ||
| #define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b) | ||
|
|
||
| /// These are internal intrinsics. C/C++ users should avoid calling them | ||
| /// directly. | ||
|
|
||
| /// Internal helper behind __tile_cvtrowd2ps. Forwards the tile's row count | ||
| /// (m), column size (n), tile data (src) and row selector (u) unchanged to | ||
| /// the tcvtrowd2ps internal builtin and returns its fp32 result. | ||
| static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal( | ||
| unsigned short m, unsigned short n, _tile1024i src, unsigned u) { | ||
| return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u); | ||
| } | ||
|
|
||
| /// Internal helper behind __tile_cvtrowps2pbf16h. Forwards the tile's row | ||
| /// count (m), column size (n), tile data (src) and row selector (u) | ||
| /// unchanged to the tcvtrowps2pbf16h internal builtin. | ||
| static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 | ||
| _tile_cvtrowps2pbf16h_internal(unsigned short m, unsigned short n, | ||
| _tile1024i src, unsigned u) { | ||
| return __builtin_ia32_tcvtrowps2pbf16h_internal(m, n, src, u); | ||
| } | ||
|
|
||
| /// Internal helper behind __tile_cvtrowps2pbf16l. Forwards the tile's row | ||
| /// count (m), column size (n), tile data (src) and row selector (u) | ||
| /// unchanged to the tcvtrowps2pbf16l internal builtin. | ||
| static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 | ||
| _tile_cvtrowps2pbf16l_internal(unsigned short m, unsigned short n, | ||
| _tile1024i src, unsigned u) { | ||
| return __builtin_ia32_tcvtrowps2pbf16l_internal(m, n, src, u); | ||
| } | ||
|
|
||
| /// Internal helper behind __tile_cvtrowps2phh. Forwards the tile's row count | ||
| /// (m), column size (n), tile data (src) and row selector (u) unchanged to | ||
| /// the tcvtrowps2phh internal builtin. | ||
| static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal( | ||
| unsigned short m, unsigned short n, _tile1024i src, unsigned u) { | ||
| return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u); | ||
| } | ||
|
|
||
| /// Internal helper behind __tile_cvtrowps2phl. Forwards the tile's row count | ||
| /// (m), column size (n), tile data (src) and row selector (u) unchanged to | ||
| /// the tcvtrowps2phl internal builtin. | ||
| static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal( | ||
| unsigned short m, unsigned short n, _tile1024i src, unsigned u) { | ||
| return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u); | ||
| } | ||
|
|
||
| /// Internal helper behind __tile_movrow. Forwards the tile's row count (m), | ||
| /// column size (n), tile data (src) and row selector (u) unchanged to the | ||
| /// tilemovrow internal builtin; the builtin's result is cast to __m512i. | ||
| static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal( | ||
| unsigned short m, unsigned short n, _tile1024i src, unsigned u) { | ||
| return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u); | ||
| } | ||
|
|
||
| /// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source | ||
| /// elements to fp32. No SIMD exceptions are generated. Rounding is done as if | ||
| /// MXCSR.RC=RNE. Embedded rounding is not supported. | ||
| /// The row index and row chunk of the tile are taken from the 32-bit src1. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction. | ||
| /// | ||
| /// \param src0 | ||
| /// The 1st source tile. Max size is 1024 Bytes. | ||
| /// \param src1 | ||
| /// The 2nd source r32. Size is 4 Bytes. | ||
| /// \returns | ||
| /// The destination v16f32 data. Size is 64 Bytes. | ||
| __DEFAULT_FN_ATTRS_AVX512 | ||
| static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) { | ||
| return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1); | ||
| } | ||
|
|
||
| /// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source | ||
| /// elements to bf16 placed in the high 16 bits of each dword. | ||
| /// The row index and row chunk of the tile are taken from the 32-bit src1. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// This intrinsic corresponds to the <c> TCVTROWPS2PBF16H </c> instruction. | ||
| /// | ||
| /// \param src0 | ||
| /// The 1st source tile. Max size is 1024 Bytes. | ||
| /// \param src1 | ||
| /// The 2nd source r32. Size is 4 Bytes. | ||
| /// \returns | ||
| /// The destination v32bf16 data. Size is 64 Bytes. | ||
| __DEFAULT_FN_ATTRS_AVX512 | ||
| static __m512bh __tile_cvtrowps2pbf16h(__tile1024i src0, unsigned src1) { | ||
| return _tile_cvtrowps2pbf16h_internal(src0.row, src0.col, src0.tile, src1); | ||
| } | ||
|
|
||
| /// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source | ||
| /// elements to bf16 placed in the low 16 bits of each dword. | ||
| /// The row index and row chunk of the tile are taken from the 32-bit src1. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// This intrinsic corresponds to the <c> TCVTROWPS2PBF16L </c> instruction. | ||
| /// | ||
| /// \param src0 | ||
| /// The 1st source tile. Max size is 1024 Bytes. | ||
| /// \param src1 | ||
| /// The 2nd source r32. Size is 4 Bytes. | ||
| /// \returns | ||
| /// The destination v32bf16 data. Size is 64 Bytes. | ||
| __DEFAULT_FN_ATTRS_AVX512 | ||
| static __m512bh __tile_cvtrowps2pbf16l(__tile1024i src0, unsigned src1) { | ||
| return _tile_cvtrowps2pbf16l_internal(src0.row, src0.col, src0.tile, src1); | ||
| } | ||
|
|
||
| /// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source | ||
| /// elements to fp16 placed in the high 16 bits of each dword. | ||
| /// The row index and row chunk of the tile are taken from the 32-bit src1. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction. | ||
| /// | ||
| /// \param src0 | ||
| /// The 1st source tile. Max size is 1024 Bytes. | ||
| /// \param src1 | ||
| /// The 2nd source r32. Size is 4 Bytes. | ||
| /// \returns | ||
| /// The destination v32fp16 data. Size is 64 Bytes. | ||
| __DEFAULT_FN_ATTRS_AVX512 | ||
| static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) { | ||
| return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1); | ||
| } | ||
|
|
||
| /// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source | ||
| /// elements to fp16 placed in the low 16 bits of each dword. | ||
| /// The row index and row chunk of the tile are taken from the 32-bit src1. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction. | ||
| /// | ||
| /// \param src0 | ||
| /// The 1st source tile. Max size is 1024 Bytes. | ||
| /// \param src1 | ||
| /// The 2nd source r32. Size is 4 Bytes. | ||
| /// \returns | ||
| /// The destination v32fp16 data. Size is 64 Bytes. | ||
| __DEFAULT_FN_ATTRS_AVX512 | ||
| static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) { | ||
| return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1); | ||
| } | ||
|
|
||
| /// Move one row of tile data (src0) to a v16i32 dst without conversion. | ||
| /// The row index and row chunk of the tile are taken from the 32-bit src1. | ||
| /// | ||
| /// \headerfile <immintrin.h> | ||
| /// | ||
| /// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction. | ||
| /// | ||
| /// \param src0 | ||
| /// The 1st source tile. Max size is 1024 Bytes. | ||
| /// \param src1 | ||
| /// The 2nd source r32. Size is 4 Bytes. | ||
| /// \returns | ||
| /// The destination v16i32 data. Size is 64 Bytes. | ||
| __DEFAULT_FN_ATTRS_AVX512 | ||
| static __m512i __tile_movrow(__tile1024i src0, unsigned src1) { | ||
| return _tile_movrow_internal(src0.row, src0.col, src0.tile, src1); | ||
| } | ||
|
|
||
| #endif // __x86_64__ | ||
| #endif // __AMX_AVX512INTRIN_H |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| // RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown \ | ||
| // RUN: -target-feature +amx-avx512 -target-feature +avx10.2-512 \ | ||
| // RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK | ||
|
|
||
| #include <immintrin.h> | ||
|
|
||
| // NOTE(review): buf and STRIDE are not referenced by any test visible in | ||
| // this file; confirm whether they are leftovers that can be removed. | ||
| char buf[1024]; | ||
| #define STRIDE 32 | ||
|
|
||
| // NOTE(review): buf2 is not referenced by any test visible in this file; | ||
| // confirm whether it is a leftover that can be removed. | ||
| char buf2[1024]; | ||
|
|
||
| // __tile_cvtrowd2ps must cast the tile data to x86_amx and call the | ||
| // tcvtrowd2ps.internal intrinsic, producing <16 x float>. | ||
| __m512 test_tile_cvtrowd2ps(__tile1024i a, unsigned b) { | ||
| //CHECK-LABEL: @test_tile_cvtrowd2ps | ||
| //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) | ||
| //CHECK-DAG: call <16 x float> @llvm.x86.tcvtrowd2ps.internal | ||
| return __tile_cvtrowd2ps(a, b); | ||
| } | ||
|
|
||
| // __tile_cvtrowps2pbf16h must cast the tile data to x86_amx and call the | ||
| // tcvtrowps2pbf16h.internal intrinsic, producing <32 x bfloat>. | ||
| __m512bh test_tile_cvtrowps2pbf16h(__tile1024i a, unsigned b) { | ||
| //CHECK-LABEL: @test_tile_cvtrowps2pbf16h | ||
| //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) | ||
| //CHECK-DAG: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h.internal | ||
| return __tile_cvtrowps2pbf16h(a, b); | ||
| } | ||
|
|
||
| // __tile_cvtrowps2pbf16l must cast the tile data to x86_amx and call the | ||
| // tcvtrowps2pbf16l.internal intrinsic, producing <32 x bfloat>. | ||
| __m512bh test_tile_cvtrowps2pbf16l(__tile1024i a, unsigned b) { | ||
| //CHECK-LABEL: @test_tile_cvtrowps2pbf16l | ||
| //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) | ||
| //CHECK-DAG: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l.internal | ||
| return __tile_cvtrowps2pbf16l(a, b); | ||
| } | ||
|
|
||
| // __tile_cvtrowps2phh must cast the tile data to x86_amx and call the | ||
| // tcvtrowps2phh.internal intrinsic, producing <32 x half>. | ||
| __m512h test_tile_cvtrowps2phh(__tile1024i a, unsigned b) { | ||
| //CHECK-LABEL: @test_tile_cvtrowps2phh | ||
| //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) | ||
| //CHECK-DAG: call <32 x half> @llvm.x86.tcvtrowps2phh.internal | ||
| return __tile_cvtrowps2phh(a, b); | ||
| } | ||
|
|
||
| // __tile_cvtrowps2phl must cast the tile data to x86_amx and call the | ||
| // tcvtrowps2phl.internal intrinsic, producing <32 x half>. | ||
| __m512h test_tile_cvtrowps2phl(__tile1024i a, unsigned b) { | ||
| //CHECK-LABEL: @test_tile_cvtrowps2phl | ||
| //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) | ||
| //CHECK-DAG: call <32 x half> @llvm.x86.tcvtrowps2phl.internal | ||
| return __tile_cvtrowps2phl(a, b); | ||
| } | ||
|
|
||
| // __tile_movrow must cast the tile data to x86_amx and call the | ||
| // tilemovrow.internal intrinsic, producing <16 x i32>. | ||
| __m512i test_tile_movrow(__tile1024i a, unsigned b) { | ||
| //CHECK-LABEL: @test_tile_movrow | ||
| //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) | ||
| //CHECK-DAG: call <16 x i32> @llvm.x86.tilemovrow.internal | ||
| return __tile_movrow(a, b); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-avx512 \ | ||
| // RUN: -target-feature +avx10.2-512 -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s | ||
|
|
||
| #include <immintrin.h> | ||
| #include <stddef.h> | ||
|
|
||
| // Tile-number form: tile 1 is passed as an i8 immediate next to the i32 row | ||
| // selector, and the intrinsic returns <16 x float>. | ||
| __m512 test_tile_cvtrowd2ps(unsigned int A) { | ||
| // CHECK-LABEL: @test_tile_cvtrowd2ps( | ||
| // CHECK: call <16 x float> @llvm.x86.tcvtrowd2ps(i8 1, i32 %{{.*}}) | ||
| return _tile_cvtrowd2ps(1, A); | ||
| } | ||
|
|
||
| // Tile-number form: tile 1 is passed as an i8 immediate next to the i32 row | ||
| // selector, and the intrinsic returns <32 x bfloat>. | ||
| __m512bh test_tile_cvtrowps2pbf16h(unsigned int A) { | ||
| // CHECK-LABEL: @test_tile_cvtrowps2pbf16h( | ||
| // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16h(i8 1, i32 %{{.*}}) | ||
| return _tile_cvtrowps2pbf16h(1, A); | ||
| } | ||
|
|
||
| // Tile-number form: tile 1 is passed as an i8 immediate next to the i32 row | ||
| // selector, and the intrinsic returns <32 x bfloat>. | ||
| __m512bh test_tile_cvtrowps2pbf16l(unsigned int A) { | ||
| // CHECK-LABEL: @test_tile_cvtrowps2pbf16l( | ||
| // CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2pbf16l(i8 1, i32 %{{.*}}) | ||
| return _tile_cvtrowps2pbf16l(1, A); | ||
| } | ||
|
|
||
| // Tile-number form: tile 1 is passed as an i8 immediate next to the i32 row | ||
| // selector, and the intrinsic returns <32 x half>. | ||
| __m512h test_tile_cvtrowps2phh(unsigned int A) { | ||
| // CHECK-LABEL: @test_tile_cvtrowps2phh( | ||
| // CHECK: call <32 x half> @llvm.x86.tcvtrowps2phh(i8 1, i32 %{{.*}}) | ||
| return _tile_cvtrowps2phh(1, A); | ||
| } | ||
|
|
||
| // Tile-number form: tile 1 is passed as an i8 immediate next to the i32 row | ||
| // selector, and the intrinsic returns <32 x half>. | ||
| __m512h test_tile_cvtrowps2phl(unsigned int A) { | ||
| // CHECK-LABEL: @test_tile_cvtrowps2phl( | ||
| // CHECK: call <32 x half> @llvm.x86.tcvtrowps2phl(i8 1, i32 %{{.*}}) | ||
| return _tile_cvtrowps2phl(1, A); | ||
| } | ||
|
|
||
| // Tile-number form: tile 1 is passed as an i8 immediate next to the i32 row | ||
| // selector, and the intrinsic returns <16 x i32>. The check does not pin the | ||
| // SSA value name, so it stays stable if value numbering changes. | ||
| __m512i test_tile_movrow(unsigned int A) { | ||
| // CHECK-LABEL: @test_tile_movrow( | ||
| // CHECK: call <16 x i32> @llvm.x86.tilemovrow(i8 1, i32 %{{.*}}) | ||
| return _tile_movrow(1, A); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ | ||
| // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ | ||
| // RUN: FileCheck %s -DTARGET=dx | ||
| // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ | ||
| // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ | ||
| // RUN: FileCheck %s -DTARGET=spv | ||
|
|
||
| // Test basic lowering to runtime function call. | ||
|
|
||
| // WaveActiveCountBits on a bool argument must lower to a call to the | ||
| // target-specific wave.active.countbits intrinsic. | ||
| // CHECK-LABEL: test_bool | ||
| int test_bool(bool expr) { | ||
| // CHECK: call {{.*}} @llvm.[[TARGET]].wave.active.countbits | ||
| return WaveActiveCountBits(expr); | ||
| } | ||
|
|
||
| // CHECK: declare i32 @llvm.[[TARGET]].wave.active.countbits(i1) #[[#attr:]] | ||
|
|
||
| // CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}} |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,8 @@ | ||
| // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s | ||
|
|
||
| // Calls the clamp builtin directly on int4 operands; for the DXIL target it | ||
| // must become the signed vector intrinsic llvm.dx.sclamp.v4i32. | ||
| // CHECK-LABEL: builtin_test_clamp_int4 | ||
| // CHECK: %hlsl.clamp = call <4 x i32> @llvm.dx.sclamp.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) | ||
| // CHECK: ret <4 x i32> %hlsl.clamp | ||
| int4 builtin_test_clamp_int4(int4 p0, int4 p1, int4 p2) { | ||
| return __builtin_hlsl_elementwise_clamp(p0, p1, p2); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,133 +1,143 @@ | ||
| // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ | ||
| // RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ | ||
| // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ | ||
| // RUN: -DTARGET=dx -DFNATTRS=noundef | ||
| // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ | ||
| // RUN: -emit-llvm -disable-llvm-passes -o - | \ | ||
| // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF \ | ||
| // RUN: -DTARGET=dx -DFNATTRS=noundef | ||
| // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ | ||
| // RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ | ||
| // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ | ||
| // RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" | ||
| // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ | ||
| // RUN: -emit-llvm -disable-llvm-passes -o - | \ | ||
| // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF \ | ||
| // RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" | ||
|
|
||
| #ifdef __HLSL_ENABLE_16_BIT | ||
| // int16_t scalar and 2/3/4-wide vectors: clamp must lower to the signed | ||
| // sclamp intrinsic. Each call passes p1 as both the lower and upper bound. | ||
| // NATIVE_HALF: define [[FNATTRS]] i16 @_Z16test_clamp_short | ||
| // NATIVE_HALF: call i16 @llvm.[[TARGET]].sclamp.i16( | ||
| int16_t test_clamp_short(int16_t p0, int16_t p1) { return clamp(p0, p1,p1); } | ||
| // NATIVE_HALF: define [[FNATTRS]] <2 x i16> @_Z17test_clamp_short2 | ||
| // NATIVE_HALF: call <2 x i16> @llvm.[[TARGET]].sclamp.v2i16( | ||
| int16_t2 test_clamp_short2(int16_t2 p0, int16_t2 p1) { return clamp(p0, p1,p1); } | ||
| // NATIVE_HALF: define [[FNATTRS]] <3 x i16> @_Z17test_clamp_short3 | ||
| // NATIVE_HALF: call <3 x i16> @llvm.[[TARGET]].sclamp.v3i16 | ||
| int16_t3 test_clamp_short3(int16_t3 p0, int16_t3 p1) { return clamp(p0, p1,p1); } | ||
| // NATIVE_HALF: define [[FNATTRS]] <4 x i16> @_Z17test_clamp_short4 | ||
| // NATIVE_HALF: call <4 x i16> @llvm.[[TARGET]].sclamp.v4i16 | ||
| int16_t4 test_clamp_short4(int16_t4 p0, int16_t4 p1) { return clamp(p0, p1,p1); } | ||
|
|
||
| // uint16_t scalar and 2/3/4-wide vectors: clamp must lower to the unsigned | ||
| // uclamp intrinsic. Each call passes p1 as both the lower and upper bound. | ||
| // NATIVE_HALF: define [[FNATTRS]] i16 @_Z17test_clamp_ushort | ||
| // NATIVE_HALF: call i16 @llvm.[[TARGET]].uclamp.i16( | ||
| uint16_t test_clamp_ushort(uint16_t p0, uint16_t p1) { return clamp(p0, p1,p1); } | ||
| // NATIVE_HALF: define [[FNATTRS]] <2 x i16> @_Z18test_clamp_ushort2 | ||
| // NATIVE_HALF: call <2 x i16> @llvm.[[TARGET]].uclamp.v2i16 | ||
| uint16_t2 test_clamp_ushort2(uint16_t2 p0, uint16_t2 p1) { return clamp(p0, p1,p1); } | ||
| // NATIVE_HALF: define [[FNATTRS]] <3 x i16> @_Z18test_clamp_ushort3 | ||
| // NATIVE_HALF: call <3 x i16> @llvm.[[TARGET]].uclamp.v3i16 | ||
| uint16_t3 test_clamp_ushort3(uint16_t3 p0, uint16_t3 p1) { return clamp(p0, p1,p1); } | ||
| // NATIVE_HALF: define [[FNATTRS]] <4 x i16> @_Z18test_clamp_ushort4 | ||
| // NATIVE_HALF: call <4 x i16> @llvm.[[TARGET]].uclamp.v4i16 | ||
| uint16_t4 test_clamp_ushort4(uint16_t4 p0, uint16_t4 p1) { return clamp(p0, p1,p1); } | ||
| #endif | ||
|
|
||
| // int scalar and 2/3/4-wide vectors: clamp must lower to the signed sclamp | ||
| // intrinsic. Each call passes p1 as both the lower and upper bound. | ||
| // CHECK: define [[FNATTRS]] i32 @_Z14test_clamp_int | ||
| // CHECK: call i32 @llvm.[[TARGET]].sclamp.i32( | ||
| int test_clamp_int(int p0, int p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <2 x i32> @_Z15test_clamp_int2 | ||
| // CHECK: call <2 x i32> @llvm.[[TARGET]].sclamp.v2i32 | ||
| int2 test_clamp_int2(int2 p0, int2 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <3 x i32> @_Z15test_clamp_int3 | ||
| // CHECK: call <3 x i32> @llvm.[[TARGET]].sclamp.v3i32 | ||
| int3 test_clamp_int3(int3 p0, int3 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <4 x i32> @_Z15test_clamp_int4 | ||
| // CHECK: call <4 x i32> @llvm.[[TARGET]].sclamp.v4i32 | ||
| int4 test_clamp_int4(int4 p0, int4 p1) { return clamp(p0, p1,p1); } | ||
|
|
||
| // uint scalar: clamp must lower to the unsigned uclamp intrinsic. The call | ||
| // passes p1 as both the lower and upper bound. The return type is uint so | ||
| // the result is not implicitly converted to a signed int. | ||
| // CHECK: define [[FNATTRS]] i32 @_Z15test_clamp_uint | ||
| // CHECK: call i32 @llvm.[[TARGET]].uclamp.i32( | ||
| uint test_clamp_uint(uint p0, uint p1) { return clamp(p0, p1,p1); } | ||
| // uint 2/3/4-wide vectors: clamp must lower to the unsigned uclamp | ||
| // intrinsic. Each call passes p1 as both the lower and upper bound. | ||
| // CHECK: define [[FNATTRS]] <2 x i32> @_Z16test_clamp_uint2 | ||
| // CHECK: call <2 x i32> @llvm.[[TARGET]].uclamp.v2i32 | ||
| uint2 test_clamp_uint2(uint2 p0, uint2 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <3 x i32> @_Z16test_clamp_uint3 | ||
| // CHECK: call <3 x i32> @llvm.[[TARGET]].uclamp.v3i32 | ||
| uint3 test_clamp_uint3(uint3 p0, uint3 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <4 x i32> @_Z16test_clamp_uint4 | ||
| // CHECK: call <4 x i32> @llvm.[[TARGET]].uclamp.v4i32 | ||
| uint4 test_clamp_uint4(uint4 p0, uint4 p1) { return clamp(p0, p1,p1); } | ||
|
|
||
| // int64_t scalar and 2/3/4-wide vectors: clamp must lower to the signed | ||
| // sclamp intrinsic. Each call passes p1 as both the lower and upper bound. | ||
| // CHECK: define [[FNATTRS]] i64 @_Z15test_clamp_long | ||
| // CHECK: call i64 @llvm.[[TARGET]].sclamp.i64( | ||
| int64_t test_clamp_long(int64_t p0, int64_t p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <2 x i64> @_Z16test_clamp_long2 | ||
| // CHECK: call <2 x i64> @llvm.[[TARGET]].sclamp.v2i64 | ||
| int64_t2 test_clamp_long2(int64_t2 p0, int64_t2 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <3 x i64> @_Z16test_clamp_long3 | ||
| // CHECK: call <3 x i64> @llvm.[[TARGET]].sclamp.v3i64 | ||
| int64_t3 test_clamp_long3(int64_t3 p0, int64_t3 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <4 x i64> @_Z16test_clamp_long4 | ||
| // CHECK: call <4 x i64> @llvm.[[TARGET]].sclamp.v4i64 | ||
| int64_t4 test_clamp_long4(int64_t4 p0, int64_t4 p1) { return clamp(p0, p1,p1); } | ||
|
|
||
| // uint64_t scalar and 2/3/4-wide vectors: clamp must lower to the unsigned | ||
| // uclamp intrinsic. Each call passes p1 as both the lower and upper bound. | ||
| // CHECK: define [[FNATTRS]] i64 @_Z16test_clamp_ulong | ||
| // CHECK: call i64 @llvm.[[TARGET]].uclamp.i64( | ||
| uint64_t test_clamp_ulong(uint64_t p0, uint64_t p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <2 x i64> @_Z17test_clamp_ulong2 | ||
| // CHECK: call <2 x i64> @llvm.[[TARGET]].uclamp.v2i64 | ||
| uint64_t2 test_clamp_ulong2(uint64_t2 p0, uint64_t2 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <3 x i64> @_Z17test_clamp_ulong3 | ||
| // CHECK: call <3 x i64> @llvm.[[TARGET]].uclamp.v3i64 | ||
| uint64_t3 test_clamp_ulong3(uint64_t3 p0, uint64_t3 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <4 x i64> @_Z17test_clamp_ulong4 | ||
| // CHECK: call <4 x i64> @llvm.[[TARGET]].uclamp.v4i64 | ||
| uint64_t4 test_clamp_ulong4(uint64_t4 p0, uint64_t4 p1) { return clamp(p0, p1,p1); } | ||
|
|
||
| // half scalar and 2/3/4-wide vectors: with native half enabled clamp must | ||
| // use the f16 nclamp intrinsic; otherwise half is emitted as float and the | ||
| // f32 nclamp intrinsic is expected. Each call passes p1 as both bounds. | ||
| // NATIVE_HALF: define [[FNATTRS]] half @_Z15test_clamp_half | ||
| // NATIVE_HALF: call half @llvm.[[TARGET]].nclamp.f16( | ||
| // NO_HALF: define [[FNATTRS]] float @_Z15test_clamp_half | ||
| // NO_HALF: call float @llvm.[[TARGET]].nclamp.f32( | ||
| half test_clamp_half(half p0, half p1) { return clamp(p0, p1,p1); } | ||
| // NATIVE_HALF: define [[FNATTRS]] <2 x half> @_Z16test_clamp_half2 | ||
| // NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].nclamp.v2f16 | ||
| // NO_HALF: define [[FNATTRS]] <2 x float> @_Z16test_clamp_half2 | ||
| // NO_HALF: call <2 x float> @llvm.[[TARGET]].nclamp.v2f32( | ||
| half2 test_clamp_half2(half2 p0, half2 p1) { return clamp(p0, p1,p1); } | ||
| // NATIVE_HALF: define [[FNATTRS]] <3 x half> @_Z16test_clamp_half3 | ||
| // NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].nclamp.v3f16 | ||
| // NO_HALF: define [[FNATTRS]] <3 x float> @_Z16test_clamp_half3 | ||
| // NO_HALF: call <3 x float> @llvm.[[TARGET]].nclamp.v3f32( | ||
| half3 test_clamp_half3(half3 p0, half3 p1) { return clamp(p0, p1,p1); } | ||
| // NATIVE_HALF: define [[FNATTRS]] <4 x half> @_Z16test_clamp_half4 | ||
| // NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].nclamp.v4f16 | ||
| // NO_HALF: define [[FNATTRS]] <4 x float> @_Z16test_clamp_half4 | ||
| // NO_HALF: call <4 x float> @llvm.[[TARGET]].nclamp.v4f32( | ||
| half4 test_clamp_half4(half4 p0, half4 p1) { return clamp(p0, p1,p1); } | ||
|
|
||
| // float scalar and 2/3/4-wide vectors: clamp must lower to the | ||
| // floating-point nclamp intrinsic. Each call passes p1 as both bounds. | ||
| // CHECK: define [[FNATTRS]] float @_Z16test_clamp_float | ||
| // CHECK: call float @llvm.[[TARGET]].nclamp.f32( | ||
| float test_clamp_float(float p0, float p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <2 x float> @_Z17test_clamp_float2 | ||
| // CHECK: call <2 x float> @llvm.[[TARGET]].nclamp.v2f32 | ||
| float2 test_clamp_float2(float2 p0, float2 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <3 x float> @_Z17test_clamp_float3 | ||
| // CHECK: call <3 x float> @llvm.[[TARGET]].nclamp.v3f32 | ||
| float3 test_clamp_float3(float3 p0, float3 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <4 x float> @_Z17test_clamp_float4 | ||
| // CHECK: call <4 x float> @llvm.[[TARGET]].nclamp.v4f32 | ||
| float4 test_clamp_float4(float4 p0, float4 p1) { return clamp(p0, p1,p1); } | ||
|
|
||
| // double scalar and 2/3/4-wide vectors: clamp must lower to the | ||
| // floating-point nclamp intrinsic. Each call passes p1 as both bounds. | ||
| // CHECK: define [[FNATTRS]] double @_Z17test_clamp_double | ||
| // CHECK: call double @llvm.[[TARGET]].nclamp.f64( | ||
| double test_clamp_double(double p0, double p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <2 x double> @_Z18test_clamp_double2 | ||
| // CHECK: call <2 x double> @llvm.[[TARGET]].nclamp.v2f64 | ||
| double2 test_clamp_double2(double2 p0, double2 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <3 x double> @_Z18test_clamp_double3 | ||
| // CHECK: call <3 x double> @llvm.[[TARGET]].nclamp.v3f64 | ||
| double3 test_clamp_double3(double3 p0, double3 p1) { return clamp(p0, p1,p1); } | ||
| // CHECK: define [[FNATTRS]] <4 x double> @_Z18test_clamp_double4 | ||
| // CHECK: call <4 x double> @llvm.[[TARGET]].nclamp.v4f64 | ||
| double4 test_clamp_double4(double4 p0, double4 p1) { return clamp(p0, p1,p1); } | ||