TST, SIMD: add test cases for the new intrinsics
seiko2plus committed Jan 29, 2023
1 parent e9e8582, commit 32af803
Showing 2 changed files with 278 additions and 107 deletions.
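
The diff below extends NumPy's _simd testing module so the new universal intrinsics can be driven from Python: the interleaved load_*x2/store_*x2 pairs, the pair-wise partial and non-contiguous variants (load2_till*, load2_tillz*, loadn2*, store2_till*, storen2*), the fused muladdsub, and the conditional divisions ifdiv/ifdivz. As a rough reference for the partial-access contract these wrappers exercise, here is a minimal scalar sketch assuming a 4-lane f32 vector; NLANES and the *_ref helper names are illustrative and not part of the npyv API.

#include <stddef.h>

#define NLANES 4  /* illustrative width; the real lane count depends on the target */

/* npyv_load_till_f32-like: read up to nlane elements, pad the remaining lanes with fill. */
static void load_till_ref(const float *src, size_t nlane, float fill, float dst[NLANES])
{
    for (size_t i = 0; i < NLANES; ++i) {
        dst[i] = (i < nlane) ? src[i] : fill;
    }
}

/* npyv_load_tillz_f32-like: same contract, but the tail lanes become zero. */
static void load_tillz_ref(const float *src, size_t nlane, float dst[NLANES])
{
    load_till_ref(src, nlane, 0.0f, dst);
}

/* npyv_store_till_f32-like: write back only the first nlane lanes. */
static void store_till_ref(float *dst, size_t nlane, const float vec[NLANES])
{
    for (size_t i = 0; i < NLANES && i < nlane; ++i) {
        dst[i] = vec[i];
    }
}

The new pair-wise forms apply the same contract to groups of two elements (for example the real/imaginary halves of a complex value), which is why the load2_till wrappers below parse two fill arguments and the strided variants scale their negative-stride adjustment by the pair width.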
numpy/core/src/_simd/_simd.dispatch.c.src: 112 changes (92 additions, 20 deletions)
@@ -42,23 +42,26 @@
*/
SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@)
/**end repeat1**/
SIMD_IMPL_INTRIN_1(load_@sfx@x2, v@sfx@x2, q@sfx@)

/**begin repeat1
* # intrin = store, storea, stores, storel, storeh#
* # intrin = store, storea, stores, storel, storeh, store#
* # x = ,,,,, x2#
*/
// special definition due to the nature of @intrin@
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
simd__intrin_@intrin@_@sfx@@x@(PyObject* NPY_UNUSED(self), PyObject *args)
{
simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
simd_arg vec_arg = {.dtype = simd_data_v@sfx@@x@};
if (!PyArg_ParseTuple(
args, "O&O&:@intrin@_@sfx@",
args, "O&O&:@intrin@_@sfx@@x@",
simd_arg_converter, &seq_arg,
simd_arg_converter, &vec_arg
)) {
return NULL;
}
npyv_@intrin@_@sfx@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@);
npyv_@intrin@_@sfx@@x@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@@x@);
// write-back
if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
simd_arg_free(&seq_arg);
@@ -76,23 +79,35 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
// Partial Load
SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@)
SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32)
#if @size@ == 32
SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@)
SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32)
#else
SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@)
SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32)
#endif
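/*
 * Note on the pair-wise partial loads above: load2_till_@sfx@ parses
 * (seq, nlane, fill, fill), two fill values, presumably one for each half
 * of the padded pairs, while load2_tillz_@sfx@ zero-fills the tail,
 * mirroring the established load_till/load_tillz contract.
 */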

// Partial Store
/**begin repeat1
* #intrin = store_till, store2_till, store2_till#
* #chksize= 0, 32, 64#
*/
#if !@chksize@ || @chksize@ == @size@
static PyObject *
simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
simd_arg nlane_arg = {.dtype = simd_data_u32};
simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
if (!PyArg_ParseTuple(
args, "O&O&O&:store_till_@sfx@",
args, "O&O&O&:@intrin@_@sfx@",
simd_arg_converter, &seq_arg,
simd_arg_converter, &nlane_arg,
simd_arg_converter, &vec_arg
)) {
return NULL;
}
npyv_store_till_@sfx@(
npyv_@intrin@_@sfx@(
seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@
);
// write-back
@@ -103,14 +118,22 @@ simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
simd_arg_free(&seq_arg);
Py_RETURN_NONE;
}
#endif // chksize

/**end repeat1**/
// Non-contiguous Load
/**begin repeat1
* #intrin = loadn, loadn_till, loadn_tillz#
* #till = 0, 1, 1#
* #fill = 0, 1, 0#
* #format = , O&O&, O&#
*/
* #intrin = loadn, loadn2, loadn2,
* loadn_till, loadn2_till, loadn2_till,
* loadn_tillz, loadn2_tillz, loadn2_tillz#
* #scale = 1,2,2, 1,2,2, 1,2,2#
* #till = 0*3, 1*3, 1*3#
* #fill = 0*3, 1*3, 0*3#
* #fill2 = 0*3, 0,1,1, 0*3#
* #format = ,,, O&O&, O&O&O&*2,O&*3#
* #chksize= 0,32,64, 0,32,64, 0,32,64#
*/
#if !@chksize@ || @chksize@ == @size@
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
@@ -121,6 +144,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
#endif // till
#if @fill@
simd_arg fill_arg = {.dtype = simd_data_@sfx@};
#endif
#if @fill2@
simd_arg fill2_arg = {.dtype = simd_data_@sfx@};
#endif
if (!PyArg_ParseTuple(
args, "@format@O&O&:@intrin@_@sfx@",
@@ -131,6 +157,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
#endif
#if @fill@
,simd_arg_converter, &fill_arg
#endif
#if @fill2@
,simd_arg_converter, &fill2_arg
#endif
)) {
return NULL;
@@ -140,7 +169,7 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
if (stride < 0) {
seq_ptr += cur_seq_len -1;
seq_ptr += cur_seq_len - 1 * @scale@;
min_seq_len = -min_seq_len;
}
if (cur_seq_len < min_seq_len) {
@@ -159,6 +188,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
#if @fill@
, fill_arg.data.@sfx@
#endif
#if @fill2@
, fill2_arg.data.@sfx@
#endif
);
simd_arg ret = {
.dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec}
@@ -169,14 +201,19 @@ err:
simd_arg_free(&seq_arg);
return NULL;
}
#endif // chksize
/**end repeat1**/
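/*
 * Rough model of the non-contiguous loads above (illustrative, not the npyv
 * implementation): loadn_@sfx@(ptr, stride) gathers ptr[0], ptr[stride],
 * ptr[2*stride], ..., while the loadn2 variants gather a contiguous pair
 * (ptr[i*stride], ptr[i*stride + 1]) per step, which is why the negative
 * stride adjustment above steps back by the pair width rather than by one
 * element. The *_till/*_tillz forms combine this with the partial-load
 * padding described earlier.
 */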

// Non-contiguous Store
/**begin repeat1
* #intrin = storen, storen_till#
* #till = 0, 1#
* #format = , O&#
* #intrin = storen, storen2, storen2,
storen_till, storen2_till, storen2_till#
* #scale = 1,2,2, 1,2,2#
* #till = 0*3, 1*3#
* #format = ,,, O&*3#
* #chksize= 0,32,64, 0,32,64#
*/
#if !@chksize@ || @chksize@ == @size@
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
Expand All @@ -202,7 +239,7 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
if (stride < 0) {
seq_ptr += cur_seq_len -1;
seq_ptr += cur_seq_len - 1*@scale@;
min_seq_len = -min_seq_len;
}
// overflow guard
@@ -231,6 +268,7 @@ err:
simd_arg_free(&seq_arg);
return NULL;
}
#endif // chksize
/**end repeat1**/
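/*
 * The non-contiguous stores above mirror those loads: storen_@sfx@ scatters
 * one element per stride step, storen2_@sfx@ scatters pairs, and the *_till
 * forms stop after the requested nlane count, hence the shared overflow guard
 * on the sequence length before the write-back.
 */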
#endif // @ncont_sup@

@@ -441,7 +479,7 @@ SIMD_IMPL_INTRIN_2(divc_@sfx@, v@sfx@, v@sfx@, v@sfx@x3)

#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
* #intrin = muladd, mulsub, nmuladd, nmulsub, muladdsub#
*/
SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
@@ -492,6 +530,11 @@ SIMD_IMPL_INTRIN_1(reduce_@intrin@_@sfx@, @sfx@, v@sfx@)
SIMD_IMPL_INTRIN_4(@intrin@_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/

#if @fp_only@
SIMD_IMPL_INTRIN_4(ifdiv_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_3(ifdivz_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
#endif
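/*
 * Newly exposed here: muladdsub_@sfx@ in the fused block above, presumably a
 * multiply with alternating subtract/add across even/odd lanes (the usual
 * fmaddsub pattern behind complex multiplication), and the FP-only
 * ifdiv_@sfx@/ifdivz_@sfx@, presumably mask-guarded divisions that compute
 * a/b where the boolean vector is set and fall back to the extra operand
 * (ifdiv) or to zero (ifdivz) elsewhere.
 */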

#endif // simd_sup
/**end repeat**/
/*************************************************************************
@@ -595,6 +638,12 @@ static PyMethodDef simd__intrinsics_methods[] = {
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/

/**begin repeat1
* # intrin = load, store#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@x2)
/**end repeat1**/

/****************************************
* Non-contiguous/Partial Memory access
****************************************/
@@ -605,6 +654,21 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#if @size@ == 32
/**begin repeat1
* #intrin = load2_till, load2_tillz, loadn2, loadn2_till, loadn2_tillz,
* store2_till, storen2, storen2_till#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#else
/**begin repeat1
* #intrin = load2_till, load2_tillz, loadn2, loadn2_till, loadn2_tillz,
* store2_till, storen2, storen2_till#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif
#endif // ncont_sup

/****************************
@@ -716,7 +780,7 @@ SIMD_INTRIN_DEF(divc_@sfx@)

#if @fused_sup@
/**begin repeat1
* #intrin = muladd, mulsub, nmuladd, nmulsub#
* #intrin = muladd, mulsub, nmuladd, nmulsub, muladdsub#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
@@ -766,6 +830,14 @@ SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/

#if @fp_only@
/**begin repeat1
* #intrin = ifdiv, ifdivz#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif

#endif // simd_sup
/**end repeat**/
/*************************************************************************