Permalink
2889 lines (2598 sloc) 85.8 KB
/*
* This file contains the implementation of the 'einsum' function,
* which provides an einstein-summation operation.
*
* Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
* The University of British Columbia
*
* See LICENSE.txt for the license.
*/
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "structmember.h"
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include <numpy/npy_common.h>
#include <numpy/arrayobject.h>
#include <numpy/halffloat.h>
#include <npy_pycompat.h>
#include <ctype.h>
#include "convert.h"
#include "common.h"
#include "ctors.h"
#ifdef NPY_HAVE_SSE_INTRINSICS
#define EINSUM_USE_SSE1 1
#else
#define EINSUM_USE_SSE1 0
#endif
/*
* TODO: Only some SSE2 for float64 is implemented.
*/
#ifdef NPY_HAVE_SSE2_INTRINSICS
#define EINSUM_USE_SSE2 1
#else
#define EINSUM_USE_SSE2 0
#endif
#if EINSUM_USE_SSE1
#include <xmmintrin.h>
#endif
#if EINSUM_USE_SSE2
#include <emmintrin.h>
#endif
#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
/********** PRINTF DEBUG TRACING **************/
#define NPY_EINSUM_DBG_TRACING 0
#if NPY_EINSUM_DBG_TRACING
#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s);
#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1);
#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2);
#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s);
#else
#define NPY_EINSUM_DBG_PRINT(s)
#define NPY_EINSUM_DBG_PRINT1(s, p1)
#define NPY_EINSUM_DBG_PRINT2(s, p1, p2)
#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3)
#endif
/**********************************************/
/**begin repeat
* #name = byte, short, int, long, longlong,
* ubyte, ushort, uint, ulong, ulonglong,
* half, float, double, longdouble,
* cfloat, cdouble, clongdouble#
* #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
* npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
* npy_half, npy_float, npy_double, npy_longdouble,
* npy_cfloat, npy_cdouble, npy_clongdouble#
* #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
* npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
* npy_float, npy_float, npy_double, npy_longdouble,
* npy_float, npy_double, npy_longdouble#
* #to = ,,,,,
* ,,,,,
* npy_float_to_half,,,,
* ,,#
* #from = ,,,,,
* ,,,,,
* npy_half_to_float,,,,
* ,,#
* #complex = 0*5,
* 0*5,
* 0*4,
* 1*3#
* #float32 = 0*5,
* 0*5,
* 0,1,0,0,
* 0*3#
* #float64 = 0*5,
* 0*5,
* 0,0,1,0,
* 0*3#
*/
/**begin repeat1
* #nop = 1, 2, 3, 1000#
* #noplabel = one, two, three, any#
*/
static void
@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
npy_intp *strides, npy_intp count)
{
#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
char *data0 = dataptr[0];
npy_intp stride0 = strides[0];
#endif
#if (@nop@ == 2 || @nop@ == 3) && !@complex@
char *data1 = dataptr[1];
npy_intp stride1 = strides[1];
#endif
#if (@nop@ == 3) && !@complex@
char *data2 = dataptr[2];
npy_intp stride2 = strides[2];
#endif
#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
char *data_out = dataptr[@nop@];
npy_intp stride_out = strides[@nop@];
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count);
while (count--) {
#if !@complex@
# if @nop@ == 1
*(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) +
@from@(*(@type@ *)data_out));
data0 += stride0;
data_out += stride_out;
# elif @nop@ == 2
*(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
@from@(*(@type@ *)data1) +
@from@(*(@type@ *)data_out));
data0 += stride0;
data1 += stride1;
data_out += stride_out;
# elif @nop@ == 3
*(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
@from@(*(@type@ *)data1) *
@from@(*(@type@ *)data2) +
@from@(*(@type@ *)data_out));
data0 += stride0;
data1 += stride1;
data2 += stride2;
data_out += stride_out;
# else
@temptype@ temp = @from@(*(@type@ *)dataptr[0]);
int i;
for (i = 1; i < nop; ++i) {
temp *= @from@(*(@type@ *)dataptr[i]);
}
*(@type@ *)dataptr[nop] = @to@(temp +
@from@(*(@type@ *)dataptr[i]));
for (i = 0; i <= nop; ++i) {
dataptr[i] += strides[i];
}
# endif
#else /* complex */
# if @nop@ == 1
((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] +
((@temptype@ *)data_out)[0];
((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] +
((@temptype@ *)data_out)[1];
data0 += stride0;
data_out += stride_out;
# else
# if @nop@ <= 3
#define _SUMPROD_NOP @nop@
# else
#define _SUMPROD_NOP nop
# endif
@temptype@ re, im, tmp;
int i;
re = ((@temptype@ *)dataptr[0])[0];
im = ((@temptype@ *)dataptr[0])[1];
for (i = 1; i < _SUMPROD_NOP; ++i) {
tmp = re * ((@temptype@ *)dataptr[i])[0] -
im * ((@temptype@ *)dataptr[i])[1];
im = re * ((@temptype@ *)dataptr[i])[1] +
im * ((@temptype@ *)dataptr[i])[0];
re = tmp;
}
((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
for (i = 0; i <= _SUMPROD_NOP; ++i) {
dataptr[i] += strides[i];
}
#undef _SUMPROD_NOP
# endif
#endif
}
}
#if @nop@ == 1
static void
@name@_sum_of_products_contig_one(int nop, char **dataptr,
npy_intp *NPY_UNUSED(strides), npy_intp count)
{
@type@ *data0 = (@type@ *)dataptr[0];
@type@ *data_out = (@type@ *)dataptr[1];
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n",
(int)count);
/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
switch (count) {
/**begin repeat2
* #i = 6, 5, 4, 3, 2, 1, 0#
*/
case @i@+1:
#if !@complex@
data_out[@i@] = @to@(@from@(data0[@i@]) +
@from@(data_out[@i@]));
#else
((@temptype@ *)data_out + 2*@i@)[0] =
((@temptype@ *)data0 + 2*@i@)[0] +
((@temptype@ *)data_out + 2*@i@)[0];
((@temptype@ *)data_out + 2*@i@)[1] =
((@temptype@ *)data0 + 2*@i@)[1] +
((@temptype@ *)data_out + 2*@i@)[1];
#endif
/**end repeat2**/
case 0:
return;
}
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
#if !@complex@
data_out[@i@] = @to@(@from@(data0[@i@]) +
@from@(data_out[@i@]));
#else /* complex */
((@temptype@ *)data_out + 2*@i@)[0] =
((@temptype@ *)data0 + 2*@i@)[0] +
((@temptype@ *)data_out + 2*@i@)[0];
((@temptype@ *)data_out + 2*@i@)[1] =
((@temptype@ *)data0 + 2*@i@)[1] +
((@temptype@ *)data_out + 2*@i@)[1];
#endif
/**end repeat2**/
data0 += 8;
data_out += 8;
}
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#elif @nop@ == 2 && !@complex@
static void
@name@_sum_of_products_contig_two(int nop, char **dataptr,
npy_intp *NPY_UNUSED(strides), npy_intp count)
{
@type@ *data0 = (@type@ *)dataptr[0];
@type@ *data1 = (@type@ *)dataptr[1];
@type@ *data_out = (@type@ *)dataptr[2];
#if EINSUM_USE_SSE1 && @float32@
__m128 a, b;
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
(int)count);
/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
switch (count) {
/**begin repeat2
* #i = 6, 5, 4, 3, 2, 1, 0#
*/
case @i@+1:
data_out[@i@] = @to@(@from@(data0[@i@]) *
@from@(data1[@i@]) +
@from@(data_out[@i@]));
/**end repeat2**/
case 0:
return;
}
#if EINSUM_USE_SSE1 && @float32@
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
EINSUM_IS_SSE_ALIGNED(data_out)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
/**begin repeat2
* #i = 0, 4#
*/
a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
_mm_store_ps(data_out+@i@, b);
/**end repeat2**/
data0 += 8;
data1 += 8;
data_out += 8;
}
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#endif
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
#if EINSUM_USE_SSE1 && @float32@
/**begin repeat2
* #i = 0, 4#
*/
a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
_mm_storeu_ps(data_out+@i@, b);
/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
data_out[@i@] = @to@(@from@(data0[@i@]) *
@from@(data1[@i@]) +
@from@(data_out[@i@]));
/**end repeat2**/
#endif
data0 += 8;
data1 += 8;
data_out += 8;
}
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
/* Some extra specializations for the two operand case */
static void
@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
npy_intp *NPY_UNUSED(strides), npy_intp count)
{
@temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
@type@ *data1 = (@type@ *)dataptr[1];
@type@ *data_out = (@type@ *)dataptr[2];
#if EINSUM_USE_SSE1 && @float32@
__m128 a, b, value0_sse;
#elif EINSUM_USE_SSE2 && @float64@
__m128d a, b, value0_sse;
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
(int)count);
/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
switch (count) {
/**begin repeat2
* #i = 6, 5, 4, 3, 2, 1, 0#
*/
case @i@+1:
data_out[@i@] = @to@(value0 *
@from@(data1[@i@]) +
@from@(data_out[@i@]));
/**end repeat2**/
case 0:
return;
}
#if EINSUM_USE_SSE1 && @float32@
value0_sse = _mm_set_ps1(value0);
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
/**begin repeat2
* #i = 0, 4#
*/
a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
_mm_store_ps(data_out+@i@, b);
/**end repeat2**/
data1 += 8;
data_out += 8;
}
/* Finish off the loop */
if (count > 0) {
goto finish_after_unrolled_loop;
}
else {
return;
}
}
#elif EINSUM_USE_SSE2 && @float64@
value0_sse = _mm_set1_pd(value0);
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
/**begin repeat2
* #i = 0, 2, 4, 6#
*/
a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
_mm_store_pd(data_out+@i@, b);
/**end repeat2**/
data1 += 8;
data_out += 8;
}
/* Finish off the loop */
if (count > 0) {
goto finish_after_unrolled_loop;
}
else {
return;
}
}
#endif
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
#if EINSUM_USE_SSE1 && @float32@
/**begin repeat2
* #i = 0, 4#
*/
a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
_mm_storeu_ps(data_out+@i@, b);
/**end repeat2**/
#elif EINSUM_USE_SSE2 && @float64@
/**begin repeat2
* #i = 0, 2, 4, 6#
*/
a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
_mm_storeu_pd(data_out+@i@, b);
/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
data_out[@i@] = @to@(value0 *
@from@(data1[@i@]) +
@from@(data_out[@i@]));
/**end repeat2**/
#endif
data1 += 8;
data_out += 8;
}
/* Finish off the loop */
if (count > 0) {
goto finish_after_unrolled_loop;
}
}
static void
@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
npy_intp *NPY_UNUSED(strides), npy_intp count)
{
@type@ *data0 = (@type@ *)dataptr[0];
@temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
@type@ *data_out = (@type@ *)dataptr[2];
#if EINSUM_USE_SSE1 && @float32@
__m128 a, b, value1_sse;
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
(int)count);
/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
switch (count) {
/**begin repeat2
* #i = 6, 5, 4, 3, 2, 1, 0#
*/
case @i@+1:
data_out[@i@] = @to@(@from@(data0[@i@])*
value1 +
@from@(data_out[@i@]));
/**end repeat2**/
case 0:
return;
}
#if EINSUM_USE_SSE1 && @float32@
value1_sse = _mm_set_ps1(value1);
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
/**begin repeat2
* #i = 0, 4#
*/
a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
_mm_store_ps(data_out+@i@, b);
/**end repeat2**/
data0 += 8;
data_out += 8;
}
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#endif
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
#if EINSUM_USE_SSE1 && @float32@
/**begin repeat2
* #i = 0, 4#
*/
a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
_mm_storeu_ps(data_out+@i@, b);
/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
data_out[@i@] = @to@(@from@(data0[@i@])*
value1 +
@from@(data_out[@i@]));
/**end repeat2**/
#endif
data0 += 8;
data_out += 8;
}
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
static void
@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
npy_intp *NPY_UNUSED(strides), npy_intp count)
{
@type@ *data0 = (@type@ *)dataptr[0];
@type@ *data1 = (@type@ *)dataptr[1];
@temptype@ accum = 0;
#if EINSUM_USE_SSE1 && @float32@
__m128 a, accum_sse = _mm_setzero_ps();
#elif EINSUM_USE_SSE2 && @float64@
__m128d a, accum_sse = _mm_setzero_pd();
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
(int)count);
/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
switch (count) {
/**begin repeat2
* #i = 6, 5, 4, 3, 2, 1, 0#
*/
case @i@+1:
accum += @from@(data0[@i@]) * @from@(data1[@i@]);
/**end repeat2**/
case 0:
*(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
return;
}
#if EINSUM_USE_SSE1 && @float32@
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
_mm_prefetch(data0 + 512, _MM_HINT_T0);
_mm_prefetch(data1 + 512, _MM_HINT_T0);
/**begin repeat2
* #i = 0, 4#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
accum_sse = _mm_add_ps(accum_sse, a);
/**end repeat2**/
data0 += 8;
data1 += 8;
}
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#elif EINSUM_USE_SSE2 && @float64@
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
_mm_prefetch(data0 + 512, _MM_HINT_T0);
_mm_prefetch(data1 + 512, _MM_HINT_T0);
/**begin repeat2
* #i = 0, 2, 4, 6#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
accum_sse = _mm_add_pd(accum_sse, a);
/**end repeat2**/
data0 += 8;
data1 += 8;
}
/* Add the two SSE2 values and put in accum */
a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
accum_sse = _mm_add_pd(a, accum_sse);
_mm_store_sd(&accum, accum_sse);
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#endif
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
#if EINSUM_USE_SSE1 && @float32@
_mm_prefetch(data0 + 512, _MM_HINT_T0);
_mm_prefetch(data1 + 512, _MM_HINT_T0);
/**begin repeat2
* #i = 0, 4#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
accum_sse = _mm_add_ps(accum_sse, a);
/**end repeat2**/
#elif EINSUM_USE_SSE2 && @float64@
_mm_prefetch(data0 + 512, _MM_HINT_T0);
_mm_prefetch(data1 + 512, _MM_HINT_T0);
/**begin repeat2
* #i = 0, 2, 4, 6#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
accum_sse = _mm_add_pd(accum_sse, a);
/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
accum += @from@(data0[@i@]) * @from@(data1[@i@]);
/**end repeat2**/
#endif
data0 += 8;
data1 += 8;
}
#if EINSUM_USE_SSE1 && @float32@
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
#elif EINSUM_USE_SSE2 && @float64@
/* Add the two SSE2 values and put in accum */
a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
accum_sse = _mm_add_pd(a, accum_sse);
_mm_store_sd(&accum, accum_sse);
#endif
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
static void
@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
npy_intp *NPY_UNUSED(strides), npy_intp count)
{
@temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
@type@ *data1 = (@type@ *)dataptr[1];
@temptype@ accum = 0;
#if EINSUM_USE_SSE1 && @float32@
__m128 a, accum_sse = _mm_setzero_ps();
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
(int)count);
/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
switch (count) {
/**begin repeat2
* #i = 6, 5, 4, 3, 2, 1, 0#
*/
case @i@+1:
accum += @from@(data1[@i@]);
/**end repeat2**/
case 0:
*(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
return;
}
#if EINSUM_USE_SSE1 && @float32@
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data1)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
/**begin repeat2
* #i = 0, 4#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
/**end repeat2**/
data1 += 8;
}
#if EINSUM_USE_SSE1 && @float32@
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
#endif
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#endif
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
#if EINSUM_USE_SSE1 && @float32@
/**begin repeat2
* #i = 0, 4#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
accum += @from@(data1[@i@]);
/**end repeat2**/
#endif
data1 += 8;
}
#if EINSUM_USE_SSE1 && @float32@
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
#endif
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
static void
@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
npy_intp *NPY_UNUSED(strides), npy_intp count)
{
@type@ *data0 = (@type@ *)dataptr[0];
@temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
@temptype@ accum = 0;
#if EINSUM_USE_SSE1 && @float32@
__m128 a, accum_sse = _mm_setzero_ps();
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
(int)count);
/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
switch (count) {
/**begin repeat2
* #i = 6, 5, 4, 3, 2, 1, 0#
*/
case @i@+1:
accum += @from@(data0[@i@]);
/**end repeat2**/
case 0:
*(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
return;
}
#if EINSUM_USE_SSE1 && @float32@
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data0)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
/**begin repeat2
* #i = 0, 4#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
/**end repeat2**/
data0 += 8;
}
#if EINSUM_USE_SSE1 && @float32@
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
#endif
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#endif
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
#if EINSUM_USE_SSE1 && @float32@
/**begin repeat2
* #i = 0, 4#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
accum += @from@(data0[@i@]);
/**end repeat2**/
#endif
data0 += 8;
}
#if EINSUM_USE_SSE1 && @float32@
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
#endif
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#elif @nop@ == 3 && !@complex@
static void
@name@_sum_of_products_contig_three(int nop, char **dataptr,
npy_intp *NPY_UNUSED(strides), npy_intp count)
{
@type@ *data0 = (@type@ *)dataptr[0];
@type@ *data1 = (@type@ *)dataptr[1];
@type@ *data2 = (@type@ *)dataptr[2];
@type@ *data_out = (@type@ *)dataptr[3];
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
data_out[@i@] = @to@(@from@(data0[@i@]) *
@from@(data1[@i@]) *
@from@(data2[@i@]) +
@from@(data_out[@i@]));
/**end repeat2**/
data0 += 8;
data1 += 8;
data2 += 8;
data_out += 8;
}
/* Finish off the loop */
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
if (count-- == 0) {
return;
}
data_out[@i@] = @to@(@from@(data0[@i@]) *
@from@(data1[@i@]) *
@from@(data2[@i@]) +
@from@(data_out[@i@]));
/**end repeat2**/
}
#else /* @nop@ > 3 || @complex */
static void
@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
npy_intp *NPY_UNUSED(strides), npy_intp count)
{
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n",
(int)count);
while (count--) {
#if !@complex@
@temptype@ temp = @from@(*(@type@ *)dataptr[0]);
int i;
for (i = 1; i < nop; ++i) {
temp *= @from@(*(@type@ *)dataptr[i]);
}
*(@type@ *)dataptr[nop] = @to@(temp +
@from@(*(@type@ *)dataptr[i]));
for (i = 0; i <= nop; ++i) {
dataptr[i] += sizeof(@type@);
}
#else /* complex */
# if @nop@ <= 3
# define _SUMPROD_NOP @nop@
# else
# define _SUMPROD_NOP nop
# endif
@temptype@ re, im, tmp;
int i;
re = ((@temptype@ *)dataptr[0])[0];
im = ((@temptype@ *)dataptr[0])[1];
for (i = 1; i < _SUMPROD_NOP; ++i) {
tmp = re * ((@temptype@ *)dataptr[i])[0] -
im * ((@temptype@ *)dataptr[i])[1];
im = re * ((@temptype@ *)dataptr[i])[1] +
im * ((@temptype@ *)dataptr[i])[0];
re = tmp;
}
((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
for (i = 0; i <= _SUMPROD_NOP; ++i) {
dataptr[i] += sizeof(@type@);
}
# undef _SUMPROD_NOP
#endif
}
}
#endif /* functions for various @nop@ */
#if @nop@ == 1
static void
@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
npy_intp *strides, npy_intp count)
{
#if @complex@
@temptype@ accum_re = 0, accum_im = 0;
@temptype@ *data0 = (@temptype@ *)dataptr[0];
#else
@temptype@ accum = 0;
@type@ *data0 = (@type@ *)dataptr[0];
#endif
#if EINSUM_USE_SSE1 && @float32@
__m128 a, accum_sse = _mm_setzero_ps();
#elif EINSUM_USE_SSE2 && @float64@
__m128d a, accum_sse = _mm_setzero_pd();
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
(int)count);
/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
switch (count) {
/**begin repeat2
* #i = 6, 5, 4, 3, 2, 1, 0#
*/
case @i@+1:
#if !@complex@
accum += @from@(data0[@i@]);
#else /* complex */
accum_re += data0[2*@i@+0];
accum_im += data0[2*@i@+1];
#endif
/**end repeat2**/
case 0:
#if @complex@
((@temptype@ *)dataptr[1])[0] += accum_re;
((@temptype@ *)dataptr[1])[1] += accum_im;
#else
*((@type@ *)dataptr[1]) = @to@(accum +
@from@(*((@type@ *)dataptr[1])));
#endif
return;
}
#if EINSUM_USE_SSE1 && @float32@
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data0)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
_mm_prefetch(data0 + 512, _MM_HINT_T0);
/**begin repeat2
* #i = 0, 4#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
/**end repeat2**/
data0 += 8;
}
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#elif EINSUM_USE_SSE2 && @float64@
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data0)) {
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
_mm_prefetch(data0 + 512, _MM_HINT_T0);
/**begin repeat2
* #i = 0, 2, 4, 6#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
/**end repeat2**/
data0 += 8;
}
/* Add the two SSE2 values and put in accum */
a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
accum_sse = _mm_add_pd(a, accum_sse);
_mm_store_sd(&accum, accum_sse);
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#endif
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
#if EINSUM_USE_SSE1 && @float32@
_mm_prefetch(data0 + 512, _MM_HINT_T0);
/**begin repeat2
* #i = 0, 4#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
/**end repeat2**/
#elif EINSUM_USE_SSE2 && @float64@
_mm_prefetch(data0 + 512, _MM_HINT_T0);
/**begin repeat2
* #i = 0, 2, 4, 6#
*/
/*
* NOTE: This accumulation changes the order, so will likely
* produce slightly different results.
*/
accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
# if !@complex@
accum += @from@(data0[@i@]);
# else /* complex */
accum_re += data0[2*@i@+0];
accum_im += data0[2*@i@+1];
# endif
/**end repeat2**/
#endif
#if !@complex@
data0 += 8;
#else
data0 += 8*2;
#endif
}
#if EINSUM_USE_SSE1 && @float32@
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
#elif EINSUM_USE_SSE2 && @float64@
/* Add the two SSE2 values and put in accum */
a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
accum_sse = _mm_add_pd(a, accum_sse);
_mm_store_sd(&accum, accum_sse);
#endif
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
#endif /* @nop@ == 1 */
static void
@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
npy_intp *strides, npy_intp count)
{
#if @complex@
@temptype@ accum_re = 0, accum_im = 0;
#else
@temptype@ accum = 0;
#endif
#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
char *data0 = dataptr[0];
npy_intp stride0 = strides[0];
#endif
#if (@nop@ == 2 || @nop@ == 3) && !@complex@
char *data1 = dataptr[1];
npy_intp stride1 = strides[1];
#endif
#if (@nop@ == 3) && !@complex@
char *data2 = dataptr[2];
npy_intp stride2 = strides[2];
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
(int)count);
while (count--) {
#if !@complex@
# if @nop@ == 1
accum += @from@(*(@type@ *)data0);
data0 += stride0;
# elif @nop@ == 2
accum += @from@(*(@type@ *)data0) *
@from@(*(@type@ *)data1);
data0 += stride0;
data1 += stride1;
# elif @nop@ == 3
accum += @from@(*(@type@ *)data0) *
@from@(*(@type@ *)data1) *
@from@(*(@type@ *)data2);
data0 += stride0;
data1 += stride1;
data2 += stride2;
# else
@temptype@ temp = @from@(*(@type@ *)dataptr[0]);
int i;
for (i = 1; i < nop; ++i) {
temp *= @from@(*(@type@ *)dataptr[i]);
}
accum += temp;
for (i = 0; i < nop; ++i) {
dataptr[i] += strides[i];
}
# endif
#else /* complex */
# if @nop@ == 1
accum_re += ((@temptype@ *)data0)[0];
accum_im += ((@temptype@ *)data0)[1];
data0 += stride0;
# else
# if @nop@ <= 3
#define _SUMPROD_NOP @nop@
# else
#define _SUMPROD_NOP nop
# endif
@temptype@ re, im, tmp;
int i;
re = ((@temptype@ *)dataptr[0])[0];
im = ((@temptype@ *)dataptr[0])[1];
for (i = 1; i < _SUMPROD_NOP; ++i) {
tmp = re * ((@temptype@ *)dataptr[i])[0] -
im * ((@temptype@ *)dataptr[i])[1];
im = re * ((@temptype@ *)dataptr[i])[1] +
im * ((@temptype@ *)dataptr[i])[0];
re = tmp;
}
accum_re += re;
accum_im += im;
for (i = 0; i < _SUMPROD_NOP; ++i) {
dataptr[i] += strides[i];
}
#undef _SUMPROD_NOP
# endif
#endif
}
#if @complex@
# if @nop@ <= 3
((@temptype@ *)dataptr[@nop@])[0] += accum_re;
((@temptype@ *)dataptr[@nop@])[1] += accum_im;
# else
((@temptype@ *)dataptr[nop])[0] += accum_re;
((@temptype@ *)dataptr[nop])[1] += accum_im;
# endif
#else
# if @nop@ <= 3
*((@type@ *)dataptr[@nop@]) = @to@(accum +
@from@(*((@type@ *)dataptr[@nop@])));
# else
*((@type@ *)dataptr[nop]) = @to@(accum +
@from@(*((@type@ *)dataptr[nop])));
# endif
#endif
}
/**end repeat1**/
/**end repeat**/
/* Do OR of ANDs for the boolean type */
/**begin repeat
* #nop = 1, 2, 3, 1000#
* #noplabel = one, two, three, any#
*/
static void
bool_sum_of_products_@noplabel@(int nop, char **dataptr,
npy_intp *strides, npy_intp count)
{
#if (@nop@ <= 3)
char *data0 = dataptr[0];
npy_intp stride0 = strides[0];
#endif
#if (@nop@ == 2 || @nop@ == 3)
char *data1 = dataptr[1];
npy_intp stride1 = strides[1];
#endif
#if (@nop@ == 3)
char *data2 = dataptr[2];
npy_intp stride2 = strides[2];
#endif
#if (@nop@ <= 3)
char *data_out = dataptr[@nop@];
npy_intp stride_out = strides[@nop@];
#endif
while (count--) {
#if @nop@ == 1
*(npy_bool *)data_out = *(npy_bool *)data0 ||
*(npy_bool *)data_out;
data0 += stride0;
data_out += stride_out;
#elif @nop@ == 2
*(npy_bool *)data_out = (*(npy_bool *)data0 &&
*(npy_bool *)data1) ||
*(npy_bool *)data_out;
data0 += stride0;
data1 += stride1;
data_out += stride_out;
#elif @nop@ == 3
*(npy_bool *)data_out = (*(npy_bool *)data0 &&
*(npy_bool *)data1 &&
*(npy_bool *)data2) ||
*(npy_bool *)data_out;
data0 += stride0;
data1 += stride1;
data2 += stride2;
data_out += stride_out;
#else
npy_bool temp = *(npy_bool *)dataptr[0];
int i;
for (i = 1; i < nop; ++i) {
temp = temp && *(npy_bool *)dataptr[i];
}
*(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
for (i = 0; i <= nop; ++i) {
dataptr[i] += strides[i];
}
#endif
}
}
static void
bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
npy_intp *strides, npy_intp count)
{
#if (@nop@ <= 3)
char *data0 = dataptr[0];
#endif
#if (@nop@ == 2 || @nop@ == 3)
char *data1 = dataptr[1];
#endif
#if (@nop@ == 3)
char *data2 = dataptr[2];
#endif
#if (@nop@ <= 3)
char *data_out = dataptr[@nop@];
#endif
#if (@nop@ <= 3)
/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
switch (count) {
/**begin repeat1
* #i = 6, 5, 4, 3, 2, 1, 0#
*/
case @i@+1:
# if @nop@ == 1
((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] ||
((npy_bool *)data_out)[@i@];
# elif @nop@ == 2
((npy_bool *)data_out)[@i@] =
(((npy_bool *)data0)[@i@] &&
((npy_bool *)data1)[@i@]) ||
((npy_bool *)data_out)[@i@];
# elif @nop@ == 3
((npy_bool *)data_out)[@i@] =
(((npy_bool *)data0)[@i@] &&
((npy_bool *)data1)[@i@] &&
((npy_bool *)data2)[@i@]) ||
((npy_bool *)data_out)[@i@];
# endif
/**end repeat1**/
case 0:
return;
}
#endif
/* Unroll the loop by 8 for fixed-size nop */
#if (@nop@ <= 3)
while (count >= 8) {
count -= 8;
#else
while (count--) {
#endif
# if @nop@ == 1
/**begin repeat1
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
*((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
(*((npy_bool *)data_out + @i@));
/**end repeat1**/
data0 += 8*sizeof(npy_bool);
data_out += 8*sizeof(npy_bool);
# elif @nop@ == 2
/**begin repeat1
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
*((npy_bool *)data_out + @i@) =
((*((npy_bool *)data0 + @i@)) &&
(*((npy_bool *)data1 + @i@))) ||
(*((npy_bool *)data_out + @i@));
/**end repeat1**/
data0 += 8*sizeof(npy_bool);
data1 += 8*sizeof(npy_bool);
data_out += 8*sizeof(npy_bool);
# elif @nop@ == 3
/**begin repeat1
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
*/
*((npy_bool *)data_out + @i@) =
((*((npy_bool *)data0 + @i@)) &&
(*((npy_bool *)data1 + @i@)) &&
(*((npy_bool *)data2 + @i@))) ||
(*((npy_bool *)data_out + @i@));
/**end repeat1**/
data0 += 8*sizeof(npy_bool);
data1 += 8*sizeof(npy_bool);
data2 += 8*sizeof(npy_bool);
data_out += 8*sizeof(npy_bool);
# else
npy_bool temp = *(npy_bool *)dataptr[0];
int i;
for (i = 1; i < nop; ++i) {
temp = temp && *(npy_bool *)dataptr[i];
}
*(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
for (i = 0; i <= nop; ++i) {
dataptr[i] += sizeof(npy_bool);
}
# endif
}
/* If the loop was unrolled, we need to finish it off */
#if (@nop@ <= 3)
goto finish_after_unrolled_loop;
#endif
}
static void
bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
npy_intp *strides, npy_intp count)
{
npy_bool accum = 0;
#if (@nop@ <= 3)
char *data0 = dataptr[0];
npy_intp stride0 = strides[0];
#endif
#if (@nop@ == 2 || @nop@ == 3)
char *data1 = dataptr[1];
npy_intp stride1 = strides[1];
#endif
#if (@nop@ == 3)
char *data2 = dataptr[2];
npy_intp stride2 = strides[2];
#endif
while (count--) {
#if @nop@ == 1
accum = *(npy_bool *)data0 || accum;
data0 += stride0;
#elif @nop@ == 2
accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
data0 += stride0;
data1 += stride1;
#elif @nop@ == 3
accum = (*(npy_bool *)data0 &&
*(npy_bool *)data1 &&
*(npy_bool *)data2) || accum;
data0 += stride0;
data1 += stride1;
data2 += stride2;
#else
npy_bool temp = *(npy_bool *)dataptr[0];
int i;
for (i = 1; i < nop; ++i) {
temp = temp && *(npy_bool *)dataptr[i];
}
accum = temp || accum;
for (i = 0; i <= nop; ++i) {
dataptr[i] += strides[i];
}
#endif
}
# if @nop@ <= 3
*((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
# else
*((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
# endif
}
/**end repeat**/
typedef void (*sum_of_products_fn)(int, char **, npy_intp *, npy_intp);
/* These tables need to match up with the type enum */
static sum_of_products_fn
_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
/**begin repeat
* #name = bool,
* byte, ubyte,
* short, ushort,
* int, uint,
* long, ulong,
* longlong, ulonglong,
* float, double, longdouble,
* cfloat, cdouble, clongdouble,
* object, string, unicode, void,
* datetime, timedelta, half#
* #use = 0,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1, 1,
* 1, 1, 1,
* 0, 0, 0, 0,
* 0, 0, 1#
*/
#if @use@
&@name@_sum_of_products_contig_outstride0_one,
#else
NULL,
#endif
/**end repeat**/
}; /* End of _contig_outstride0_unary_specialization_table */
static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
/**begin repeat
* #name = bool,
* byte, ubyte,
* short, ushort,
* int, uint,
* long, ulong,
* longlong, ulonglong,
* float, double, longdouble,
* cfloat, cdouble, clongdouble,
* object, string, unicode, void,
* datetime, timedelta, half#
* #use = 0,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1, 1,
* 0, 0, 0,
* 0, 0, 0, 0,
* 0, 0, 1#
*/
#if @use@
{
&@name@_sum_of_products_stride0_contig_outstride0_two,
&@name@_sum_of_products_stride0_contig_outcontig_two,
&@name@_sum_of_products_contig_stride0_outstride0_two,
&@name@_sum_of_products_contig_stride0_outcontig_two,
&@name@_sum_of_products_contig_contig_outstride0_two,
},
#else
{NULL, NULL, NULL, NULL, NULL},
#endif
/**end repeat**/
}; /* End of _binary_specialization_table */
static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
/**begin repeat
* #name = bool,
* byte, ubyte,
* short, ushort,
* int, uint,
* long, ulong,
* longlong, ulonglong,
* float, double, longdouble,
* cfloat, cdouble, clongdouble,
* object, string, unicode, void,
* datetime, timedelta, half#
* #use = 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1, 1,
* 1, 1, 1,
* 0, 0, 0, 0,
* 0, 0, 1#
*/
#if @use@
{
&@name@_sum_of_products_outstride0_any,
&@name@_sum_of_products_outstride0_one,
&@name@_sum_of_products_outstride0_two,
&@name@_sum_of_products_outstride0_three
},
#else
{NULL, NULL, NULL, NULL},
#endif
/**end repeat**/
}; /* End of _outstride0_specialized_table */
static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
/**begin repeat
* #name = bool,
* byte, ubyte,
* short, ushort,
* int, uint,
* long, ulong,
* longlong, ulonglong,
* float, double, longdouble,
* cfloat, cdouble, clongdouble,
* object, string, unicode, void,
* datetime, timedelta, half#
* #use = 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1, 1,
* 1, 1, 1,
* 0, 0, 0, 0,
* 0, 0, 1#
*/
#if @use@
{
&@name@_sum_of_products_contig_any,
&@name@_sum_of_products_contig_one,
&@name@_sum_of_products_contig_two,
&@name@_sum_of_products_contig_three
},
#else
{NULL, NULL, NULL, NULL},
#endif
/**end repeat**/
}; /* End of _allcontig_specialized_table */
static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
/**begin repeat
* #name = bool,
* byte, ubyte,
* short, ushort,
* int, uint,
* long, ulong,
* longlong, ulonglong,
* float, double, longdouble,
* cfloat, cdouble, clongdouble,
* object, string, unicode, void,
* datetime, timedelta, half#
* #use = 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1,
* 1, 1, 1,
* 1, 1, 1,
* 0, 0, 0, 0,
* 0, 0, 1#
*/
#if @use@
{
&@name@_sum_of_products_any,
&@name@_sum_of_products_one,
&@name@_sum_of_products_two,
&@name@_sum_of_products_three
},
#else
{NULL, NULL, NULL, NULL},
#endif
/**end repeat**/
}; /* End of _unnspecialized_table */
static sum_of_products_fn
get_sum_of_products_function(int nop, int type_num,
npy_intp itemsize, npy_intp *fixed_strides)
{
int iop;
if (type_num >= NPY_NTYPES) {
return NULL;
}
/* contiguous reduction */
if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
sum_of_products_fn ret =
_contig_outstride0_unary_specialization_table[type_num];
if (ret != NULL) {
return ret;
}
}
/* nop of 2 has more specializations */
if (nop == 2) {
/* Encode the zero/contiguous strides */
int code;
code = (fixed_strides[0] == 0) ? 0 :
(fixed_strides[0] == itemsize) ? 2*2*1 : 8;
code += (fixed_strides[1] == 0) ? 0 :
(fixed_strides[1] == itemsize) ? 2*1 : 8;
code += (fixed_strides[2] == 0) ? 0 :
(fixed_strides[2] == itemsize) ? 1 : 8;
if (code >= 2 && code < 7) {
sum_of_products_fn ret =
_binary_specialization_table[type_num][code-2];
if (ret != NULL) {
return ret;
}
}
}
/* Inner loop with an output stride of 0 */
if (fixed_strides[nop] == 0) {
return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
}
/* Check for all contiguous */
for (iop = 0; iop < nop + 1; ++iop) {
if (fixed_strides[iop] != itemsize) {
break;
}
}
/* Contiguous loop */
if (iop == nop + 1) {
return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
}
/* None of the above specializations caught it, general loops */
return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
}
/*
* Parses the subscripts for one operand into an output of 'ndim'
* labels. The resulting 'op_labels' array will have:
* - the ASCII code of the label for the first occurrence of a label;
* - the (negative) offset to the first occurrence of the label for
* repeated labels;
* - zero for broadcast dimensions, if subscripts has an ellipsis.
* For example:
* - subscripts="abbcbc", ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2]
* - subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99]
*/
static int
parse_operand_subscripts(char *subscripts, int length,
int ndim, int iop, char *op_labels,
char *label_counts, int *min_label, int *max_label)
{
int i;
int idim = 0;
int ellipsis = -1;
/* Process all labels for this operand */
for (i = 0; i < length; ++i) {
int label = subscripts[i];
/* A proper label for an axis. */
if (label > 0 && isalpha(label)) {
/* Check we don't exceed the operator dimensions. */
if (idim >= ndim) {
PyErr_Format(PyExc_ValueError,
"einstein sum subscripts string contains "
"too many subscripts for operand %d", iop);
return -1;
}
op_labels[idim++] = label;
if (label < *min_label) {
*min_label = label;
}
if (label > *max_label) {
*max_label = label;
}
label_counts[label]++;
}
/* The beginning of the ellipsis. */
else if (label == '.') {
/* Check it's a proper ellipsis. */
if (ellipsis != -1 || i + 2 >= length
|| subscripts[++i] != '.' || subscripts[++i] != '.') {
PyErr_Format(PyExc_ValueError,
"einstein sum subscripts string contains a "
"'.' that is not part of an ellipsis ('...') "
"in operand %d", iop);
return -1;
}
ellipsis = idim;
}
else if (label != ' ') {
PyErr_Format(PyExc_ValueError,
"invalid subscript '%c' in einstein sum "
"subscripts string, subscripts must "
"be letters", (char)label);
return -1;
}
}
/* No ellipsis found, labels must match dimensions exactly. */
if (ellipsis == -1) {
if (idim != ndim) {
PyErr_Format(PyExc_ValueError,
"operand has more dimensions than subscripts "
"given in einstein sum, but no '...' ellipsis "
"provided to broadcast the extra dimensions.");
return -1;
}
}
/* Ellipsis found, may have to add broadcast dimensions. */
else if (idim < ndim) {
/* Move labels after ellipsis to the end. */
for (i = 0; i < idim - ellipsis; ++i) {
op_labels[ndim - i - 1] = op_labels[idim - i - 1];
}
/* Set all broadcast dimensions to zero. */
for (i = 0; i < ndim - idim; ++i) {
op_labels[ellipsis + i] = 0;
}
}
/*
* Find any labels duplicated for this operand, and turn them
* into negative offsets to the axis to merge with.
*
* In C, the char type may be signed or unsigned, but with
* twos complement arithmetic the char is ok either way here, and
* later where it matters the char is cast to a signed char.
*/
for (idim = 0; idim < ndim - 1; ++idim) {
int label = op_labels[idim];
/* If it is a proper label, find any duplicates of it. */
if (label > 0) {
/* Search for the next matching label. */
char *next = memchr(op_labels + idim + 1, label, ndim - idim - 1);
while (next != NULL) {
/* The offset from next to op_labels[idim] (negative). */
*next = (char)((op_labels + idim) - next);
/* Search for the next matching label. */
next = memchr(next + 1, label, op_labels + ndim - 1 - next);
}
}
}
return 0;
}
/*
* Parses the subscripts for the output operand into an output that
* includes 'ndim_broadcast' unlabeled dimensions, and returns the total
* number of output dimensions, or -1 if there is an error. Similarly
* to parse_operand_subscripts, the 'out_labels' array will have, for
* each dimension:
* - the ASCII code of the corresponding label;
* - zero for broadcast dimensions, if subscripts has an ellipsis.
*/
static int
parse_output_subscripts(char *subscripts, int length,
int ndim_broadcast,
const char *label_counts, char *out_labels)
{
int i, bdim;
int ndim = 0;
int ellipsis = 0;
/* Process all the output labels. */
for (i = 0; i < length; ++i) {
int label = subscripts[i];
/* A proper label for an axis. */
if (label > 0 && isalpha(label)) {
/* Check that it doesn't occur again. */
if (memchr(subscripts + i + 1, label, length - i - 1) != NULL) {
PyErr_Format(PyExc_ValueError,
"einstein sum subscripts string includes "
"output subscript '%c' multiple times",
(char)label);
return -1;
}
/* Check that it was used in the inputs. */
if (label_counts[label] == 0) {
PyErr_Format(PyExc_ValueError,
"einstein sum subscripts string included "
"output subscript '%c' which never appeared "
"in an input", (char)label);
return -1;
}
/* Check that there is room in out_labels for this label. */
if (ndim >= NPY_MAXDIMS) {
PyErr_Format(PyExc_ValueError,
"einstein sum subscripts string contains "
"too many subscripts in the output");
return -1;
}
out_labels[ndim++] = label;
}
/* The beginning of the ellipsis. */
else if (label == '.') {
/* Check it is a proper ellipsis. */
if (ellipsis || i + 2 >= length
|| subscripts[++i] != '.' || subscripts[++i] != '.') {
PyErr_SetString(PyExc_ValueError,
"einstein sum subscripts string "
"contains a '.' that is not part of "
"an ellipsis ('...') in the output");
return -1;
}
/* Check there is room in out_labels for broadcast dims. */
if (ndim + ndim_broadcast > NPY_MAXDIMS) {
PyErr_Format(PyExc_ValueError,
"einstein sum subscripts string contains "
"too many subscripts in the output");
return -1;
}
ellipsis = 1;
for (bdim = 0; bdim < ndim_broadcast; ++bdim) {
out_labels[ndim++] = 0;
}
}
else if (label != ' ') {
PyErr_Format(PyExc_ValueError,
"invalid subscript '%c' in einstein sum "
"subscripts string, subscripts must "
"be letters", (char)label);
return -1;
}
}
/* If no ellipsis was found there should be no broadcast dimensions. */
if (!ellipsis && ndim_broadcast > 0) {
PyErr_SetString(PyExc_ValueError,
"output has more dimensions than subscripts "
"given in einstein sum, but no '...' ellipsis "
"provided to broadcast the extra dimensions.");
return -1;
}
return ndim;
}
/*
* When there's just one operand and no reduction, we
* can return a view into op. This calculates the view
* if possible.
*/
static int
get_single_op_view(PyArrayObject *op, int iop, char *labels,
int ndim_output, char *output_labels,
PyArrayObject **ret)
{
npy_intp new_strides[NPY_MAXDIMS];
npy_intp new_dims[NPY_MAXDIMS];
char *out_label;
int label, i, idim, ndim, ibroadcast = 0;
ndim = PyArray_NDIM(op);
/* Initialize the dimensions and strides to zero */
for (idim = 0; idim < ndim_output; ++idim) {
new_dims[idim] = 0;
new_strides[idim] = 0;
}
/* Match the labels in the operand with the output labels */
for (idim = 0; idim < ndim; ++idim) {
/*
* The char type may be either signed or unsigned, we
* need it to be signed here.
*/
label = (signed char)labels[idim];
/* If this label says to merge axes, get the actual label */
if (label < 0) {
label = labels[idim+label];
}
/* If the label is 0, it's an unlabeled broadcast dimension */
if (label == 0) {
/* The next output label that's a broadcast dimension */
for (; ibroadcast < ndim_output; ++ibroadcast) {
if (output_labels[ibroadcast] == 0) {
break;
}
}
if (ibroadcast == ndim_output) {
PyErr_SetString(PyExc_ValueError,
"output had too few broadcast dimensions");
return -1;
}
new_dims[ibroadcast] = PyArray_DIM(op, idim);
new_strides[ibroadcast] = PyArray_STRIDE(op, idim);
++ibroadcast;
}
else {
/* Find the position for this dimension in the output */
out_label = (char *)memchr(output_labels, label,
ndim_output);
/* If it's not found, reduction -> can't return a view */
if (out_label == NULL) {
break;
}
/* Update the dimensions and strides of the output */
i = out_label - output_labels;
if (new_dims[i] != 0 &&
new_dims[i] != PyArray_DIM(op, idim)) {
PyErr_Format(PyExc_ValueError,
"dimensions in operand %d for collapsing "
"index '%c' don't match (%d != %d)",
iop, label, (int)new_dims[i],
(int)PyArray_DIM(op, idim));
return -1;
}
new_dims[i] = PyArray_DIM(op, idim);
new_strides[i] += PyArray_STRIDE(op, idim);
}
}
/* If we processed all the input axes, return a view */
if (idim == ndim) {
Py_INCREF(PyArray_DESCR(op));
*ret = (PyArrayObject *)PyArray_NewFromDescr_int(
Py_TYPE(op), PyArray_DESCR(op),
ndim_output, new_dims, new_strides, PyArray_DATA(op),
PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
(PyObject *)op, (PyObject *)op,
0, 0);
if (*ret == NULL) {
return -1;
}
return 0;
}
/* Return success, but that we couldn't make a view */
*ret = NULL;
return 0;
}
static PyArrayObject *
get_combined_dims_view(PyArrayObject *op, int iop, char *labels)
{
npy_intp new_strides[NPY_MAXDIMS];
npy_intp new_dims[NPY_MAXDIMS];
int idim, ndim, icombine, combineoffset;
int icombinemap[NPY_MAXDIMS];
PyArrayObject *ret = NULL;
ndim = PyArray_NDIM(op);
/* Initialize the dimensions and strides to zero */
for (idim = 0; idim < ndim; ++idim) {
new_dims[idim] = 0;
new_strides[idim] = 0;
}
/* Copy the dimensions and strides, except when collapsing */
icombine = 0;
for (idim = 0; idim < ndim; ++idim) {
/*
* The char type may be either signed or unsigned, we
* need it to be signed here.
*/
int label = (signed char)labels[idim];
/* If this label says to merge axes, get the actual label */
if (label < 0) {
combineoffset = label;
label = labels[idim+label];
}
else {
combineoffset = 0;
if (icombine != idim) {
labels[icombine] = labels[idim];
}
icombinemap[idim] = icombine;
}
/* If the label is 0, it's an unlabeled broadcast dimension */
if (label == 0) {
new_dims[icombine] = PyArray_DIM(op, idim);
new_strides[icombine] = PyArray_STRIDE(op, idim);
}
else {
/* Update the combined axis dimensions and strides */
int i = icombinemap[idim + combineoffset];
if (combineoffset < 0 && new_dims[i] != 0 &&
new_dims[i] != PyArray_DIM(op, idim)) {
PyErr_Format(PyExc_ValueError,
"dimensions in operand %d for collapsing "
"index '%c' don't match (%d != %d)",
iop, label, (int)new_dims[i],
(int)PyArray_DIM(op, idim));
return NULL;
}
new_dims[i] = PyArray_DIM(op, idim);
new_strides[i] += PyArray_STRIDE(op, idim);
}
/* If the label didn't say to combine axes, increment dest i */
if (combineoffset == 0) {
icombine++;
}
}
/* The compressed number of dimensions */
ndim = icombine;
Py_INCREF(PyArray_DESCR(op));
ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
Py_TYPE(op), PyArray_DESCR(op),
ndim, new_dims, new_strides, PyArray_DATA(op),
PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
(PyObject *)op, (PyObject *)op);
return ret;
}
static int
prepare_op_axes(int ndim, int iop, char *labels, int *axes,
int ndim_iter, char *iter_labels)
{
int i, label, ibroadcast;
ibroadcast = ndim-1;
for (i = ndim_iter-1; i >= 0; --i) {
label = iter_labels[i];
/*
* If it's an unlabeled broadcast dimension, choose
* the next broadcast dimension from the operand.
*/
if (label == 0) {
while (ibroadcast >= 0 && labels[ibroadcast] != 0) {
--ibroadcast;
}
/*
* If we used up all the operand broadcast dimensions,
* extend it with a "newaxis"
*/
if (ibroadcast < 0) {
axes[i] = -1;
}
/* Otherwise map to the broadcast axis */
else {
axes[i] = ibroadcast;
--ibroadcast;
}
}
/* It's a labeled dimension, find the matching one */
else {
char *match = memchr(labels, label, ndim);
/* If the op doesn't have the label, broadcast it */
if (match == NULL) {
axes[i] = -1;
}
/* Otherwise use it */
else {
axes[i] = match - labels;
}
}
}
return 0;
}
static int
unbuffered_loop_nop1_ndim2(NpyIter *iter)
{
npy_intp coord, shape[2], strides[2][2];
char *ptrs[2][2], *ptr;
sum_of_products_fn sop;
NPY_BEGIN_THREADS_DEF;
#if NPY_EINSUM_DBG_TRACING
NpyIter_DebugPrint(iter);
#endif
NPY_EINSUM_DBG_PRINT("running hand-coded 1-op 2-dim loop\n");
NpyIter_GetShape(iter, shape);
memcpy(strides[0], NpyIter_GetAxisStrideArray(iter, 0),
2*sizeof(npy_intp));
memcpy(strides[1], NpyIter_GetAxisStrideArray(iter, 1),
2*sizeof(npy_intp));
memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter),
2*sizeof(char *));
memcpy(ptrs[1], ptrs[0], 2*sizeof(char*));
sop = get_sum_of_products_function(1,
NpyIter_GetDescrArray(iter)[0]->type_num,
NpyIter_GetDescrArray(iter)[0]->elsize,
strides[0]);
if (sop == NULL) {
PyErr_SetString(PyExc_TypeError,
"invalid data type for einsum");
return -1;
}
/*
* Since the iterator wasn't tracking coordinates, the
* loop provided by the iterator is in Fortran-order.
*/
NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
for (coord = shape[1]; coord > 0; --coord) {
sop(1, ptrs[0], strides[0], shape[0]);
ptr = ptrs[1][0] + strides[1][0];
ptrs[0][0] = ptrs[1][0] = ptr;
ptr = ptrs[1][1] + strides[1][1];
ptrs[0][1] = ptrs[1][1] = ptr;
}
NPY_END_THREADS;
return 0;
}
static int
unbuffered_loop_nop1_ndim3(NpyIter *iter)
{
npy_intp coords[2], shape[3], strides[3][2];
char *ptrs[3][2], *ptr;
sum_of_products_fn sop;
NPY_BEGIN_THREADS_DEF;
#if NPY_EINSUM_DBG_TRACING
NpyIter_DebugPrint(iter);
#endif
NPY_EINSUM_DBG_PRINT("running hand-coded 1-op 3-dim loop\n");
NpyIter_GetShape(iter, shape);
memcpy(strides[0], NpyIter_GetAxisStrideArray(iter, 0),
2*sizeof(npy_intp));
memcpy(strides[1], NpyIter_GetAxisStrideArray(iter, 1),
2*sizeof(npy_intp));
memcpy(strides[2], NpyIter_GetAxisStrideArray(iter, 2),
2*sizeof(npy_intp));
memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter),
2*sizeof(char *));
memcpy(ptrs[1], ptrs[0], 2*sizeof(char*));
memcpy(ptrs[2], ptrs[0], 2*sizeof(char*));
sop = get_sum_of_products_function(1,
NpyIter_GetDescrArray(iter)[0]->type_num,
NpyIter_GetDescrArray(iter)[0]->elsize,
strides[0]);
if (sop == NULL) {
PyErr_SetString(PyExc_TypeError,
"invalid data type for einsum");
return -1;
}
/*
* Since the iterator wasn't tracking coordinates, the
* loop provided by the iterator is in Fortran-order.
*/
NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) {
for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) {
sop(1, ptrs[0], strides[0], shape[0]);
ptr = ptrs[1][0] + strides[1][0];
ptrs[0][0] = ptrs[1][0] = ptr;
ptr = ptrs[1][1] + strides[1][1];
ptrs[0][1] = ptrs[1][1] = ptr;
}
ptr = ptrs[2][0] + strides[2][0];
ptrs[0][0] = ptrs[1][0] = ptrs[2][0] = ptr;
ptr = ptrs[2][1] + strides[2][1];
ptrs[0][1] = ptrs[1][1] = ptrs[2][1] = ptr;
}
NPY_END_THREADS;
return 0;
}
static int
unbuffered_loop_nop2_ndim2(NpyIter *iter)
{
npy_intp coord, shape[2], strides[2][3];
char *ptrs[2][3], *ptr;
sum_of_products_fn sop;
NPY_BEGIN_THREADS_DEF;
#if NPY_EINSUM_DBG_TRACING
NpyIter_DebugPrint(iter);
#endif
NPY_EINSUM_DBG_PRINT("running hand-coded 2-op 2-dim loop\n");
NpyIter_GetShape(iter, shape);
memcpy(strides[0], NpyIter_GetAxisStrideArray(iter, 0),
3*sizeof(npy_intp));
memcpy(strides[1], NpyIter_GetAxisStrideArray(iter, 1),
3*sizeof(npy_intp));
memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter),
3*sizeof(char *));
memcpy(ptrs[1], ptrs[0], 3*sizeof(char*));
sop = get_sum_of_products_function(2,
NpyIter_GetDescrArray(iter)[0]->type_num,
NpyIter_GetDescrArray(iter)[0]->elsize,
strides[0]);
if (sop == NULL) {
PyErr_SetString(PyExc_TypeError,
"invalid data type for einsum");
return -1;
}
/*
* Since the iterator wasn't tracking coordinates, the
* loop provided by the iterator is in Fortran-order.
*/
NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
for (coord = shape[1]; coord > 0; --coord) {
sop(2, ptrs[0], strides[0], shape[0]);
ptr = ptrs[1][0] + strides[1][0];
ptrs[0][0] = ptrs[1][0] = ptr;
ptr = ptrs[1][1] + strides[1][1];
ptrs[0][1] = ptrs[1][1] = ptr;
ptr = ptrs[1][2] + strides[1][2];
ptrs[0][2] = ptrs[1][2] = ptr;
}
NPY_END_THREADS;
return 0;
}
static int
unbuffered_loop_nop2_ndim3(NpyIter *iter)
{
npy_intp coords[2], shape[3], strides[3][3];
char *ptrs[3][3], *ptr;
sum_of_products_fn sop;
NPY_BEGIN_THREADS_DEF;
#if NPY_EINSUM_DBG_TRACING
NpyIter_DebugPrint(iter);
#endif
NPY_EINSUM_DBG_PRINT("running hand-coded 2-op 3-dim loop\n");
NpyIter_GetShape(iter, shape);
memcpy(strides[0], NpyIter_GetAxisStrideArray(iter, 0),
3*sizeof(npy_intp));
memcpy(strides[1], NpyIter_GetAxisStrideArray(iter, 1),
3*sizeof(npy_intp));
memcpy(strides[2], NpyIter_GetAxisStrideArray(iter, 2),
3*sizeof(npy_intp));
memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter),
3*sizeof(char *));
memcpy(ptrs[1], ptrs[0], 3*sizeof(char*));
memcpy(ptrs[2], ptrs[0], 3*sizeof(char*));
sop = get_sum_of_products_function(2,
NpyIter_GetDescrArray(iter)[0]->type_num,
NpyIter_GetDescrArray(iter)[0]->elsize,
strides[0]);
if (sop == NULL) {
PyErr_SetString(PyExc_TypeError,
"invalid data type for einsum");
return -1;
}
/*
* Since the iterator wasn't tracking coordinates, the
* loop provided by the iterator is in Fortran-order.
*/
NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) {
for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) {
sop(2, ptrs[0], strides[0], shape[0]);
ptr = ptrs[1][0] + strides[1][0];
ptrs[0][0] = ptrs[1][0] = ptr;
ptr = ptrs[1][1] + strides[1][1];
ptrs[0][1] = ptrs[1][1] = ptr;
ptr = ptrs[1][2] + strides[1][2];
ptrs[0][2] = ptrs[1][2] = ptr;
}
ptr = ptrs[2][0] + strides[2][0];
ptrs[0][0] = ptrs[1][0] = ptrs[2][0] = ptr;
ptr = ptrs[2][1] + strides[2][1];
ptrs[0][1] = ptrs[1][1] = ptrs[2][1] = ptr;
ptr = ptrs[2][2] + strides[2][2];
ptrs[0][2] = ptrs[1][2] = ptrs[2][2] = ptr;
}
NPY_END_THREADS;
return 0;
}
/*NUMPY_API
* This function provides summation of array elements according to
* the Einstein summation convention. For example:
* - trace(a) -> einsum("ii", a)
* - transpose(a) -> einsum("ji", a)
* - multiply(a,b) -> einsum(",", a, b)
* - inner(a,b) -> einsum("i,i", a, b)
* - outer(a,b) -> einsum("i,j", a, b)
* - matvec(a,b) -> einsum("ij,j", a, b)
* - matmat(a,b) -> einsum("ij,jk", a, b)
*
* subscripts: The string of subscripts for einstein summation.
* nop: The number of operands
* op_in: The array of operands
* dtype: Either NULL, or the data type to force the calculation as.
* order: The order for the calculation/the output axes.
* casting: What kind of casts should be permitted.
* out: Either NULL, or an array into which the output should be placed.
*
* By default, the labels get placed in alphabetical order
* at the end of the output. So, if c = einsum("i,j", a, b)
* then c[i,j] == a[i]*b[j], but if c = einsum("j,i", a, b)
* then c[i,j] = a[j]*b[i].
*
* Alternatively, you can control the output order or prevent
* an axis from being summed/force an axis to be summed by providing
* indices for the output. This allows us to turn 'trace' into
* 'diag', for example.
* - diag(a) -> einsum("ii->i", a)
* - sum(a, axis=0) -> einsum("i...->", a)
*
* Subscripts at the beginning and end may be specified by
* putting an ellipsis "..." in the middle. For example,
* the function einsum("i...i", a) takes the diagonal of
* the first and last dimensions of the operand, and
* einsum("ij...,jk...->ik...") takes the matrix product using
* the first two indices of each operand instead of the last two.
*
* When there is only one operand, no axes being summed, and
* no output parameter, this function returns a view
* into the operand instead of making a copy.
*/
NPY_NO_EXPORT PyArrayObject *
PyArray_EinsteinSum(char *subscripts, npy_intp nop,
PyArrayObject **op_in,
PyArray_Descr *dtype,
NPY_ORDER order, NPY_CASTING casting,
PyArrayObject *out)
{
int iop, label, min_label = 127, max_label = 0;
char label_counts[128];
char op_labels[NPY_MAXARGS][NPY_MAXDIMS];
char output_labels[NPY_MAXDIMS], *iter_labels;
int idim, ndim_output, ndim_broadcast, ndim_iter;
PyArrayObject *op[NPY_MAXARGS], *ret = NULL;
PyArray_Descr *op_dtypes_array[NPY_MAXARGS], **op_dtypes;
int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
int *op_axes[NPY_MAXARGS];
npy_uint32 iter_flags, op_flags[NPY_MAXARGS];
NpyIter *iter;
sum_of_products_fn sop;
npy_intp fixed_strides[NPY_MAXARGS];
/* nop+1 (+1 is for the output) must fit in NPY_MAXARGS */
if (nop >= NPY_MAXARGS) {
PyErr_SetString(PyExc_ValueError,
"too many operands provided to einstein sum function");
return NULL;
}
else if (nop < 1) {
PyErr_SetString(PyExc_ValueError,
"not enough operands provided to einstein sum function");
return NULL;
}
/* Parse the subscripts string into label_counts and op_labels */
memset(label_counts, 0, sizeof(label_counts));
for (iop = 0; iop < nop; ++iop) {
int length = (int)strcspn(subscripts, ",-");
if (iop == nop-1 && subscripts[length] == ',') {
PyErr_SetString(PyExc_ValueError,
"more operands provided to einstein sum function "
"than specified in the subscripts string");
return NULL;
}
else if(iop < nop-1 && subscripts[length] != ',') {
PyErr_SetString(PyExc_ValueError,
"fewer operands provided to einstein sum function "
"than specified in the subscripts string");
return NULL;
}
if (parse_operand_subscripts(subscripts, length,
PyArray_NDIM(op_in[iop]),
iop, op_labels[iop], label_counts,
&min_label, &max_label) < 0) {
return NULL;
}
/* Move subscripts to the start of the labels for the next op */
subscripts += length;
if (iop < nop-1) {
subscripts++;
}
}
/*
* Find the number of broadcast dimensions, which is the maximum
* number of labels == 0 in an op_labels array.
*/
ndim_broadcast = 0;
for (iop = 0; iop < nop; ++iop) {
npy_intp count_zeros = 0;
int ndim;
char *labels = op_labels[iop];
ndim = PyArray_NDIM(op_in[iop]);
for (idim = 0; idim < ndim; ++idim) {
if (labels[idim] == 0) {
++count_zeros;
}
}
if (count_zeros > ndim_broadcast) {
ndim_broadcast = count_zeros;
}
}
/*
* If there is no output signature, fill output_labels and ndim_output
* using each label that appeared once, in alphabetical order.
*/
if (subscripts[0] == '\0') {
/* If no output was specified, always broadcast left, as usual. */
for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) {
output_labels[ndim_output] = 0;
}
for (label = min_label; label <= max_label; ++label) {
if (label_counts[label] == 1) {
if (ndim_output < NPY_MAXDIMS) {
output_labels[ndim_output++] = label;
}
else {
PyErr_SetString(PyExc_ValueError,
"einstein sum subscript string has too many "
"distinct labels");
return NULL;
}
}
}
}
else {
if (subscripts[0] != '-' || subscripts[1] != '>') {
PyErr_SetString(PyExc_ValueError,
"einstein sum subscript string does not "
"contain proper '->' output specified");
return NULL;
}
subscripts += 2;
/* Parse the output subscript string. */
ndim_output = parse_output_subscripts(subscripts, strlen(subscripts),
ndim_broadcast, label_counts,
output_labels);
if (ndim_output < 0) {
return NULL;
}
}
if (out != NULL && PyArray_NDIM(out) != ndim_output) {
PyErr_Format(PyExc_ValueError,
"out parameter does not have the correct number of "
"dimensions, has %d but should have %d",
(int)PyArray_NDIM(out), (int)ndim_output);
return NULL;
}
/* Set all the op references to NULL */
for (iop = 0; iop < nop; ++iop) {
op[iop] = NULL;
}
/*
* Process all the input ops, combining dimensions into their
* diagonal where specified.
*/
for (iop = 0; iop < nop; ++iop) {
char *labels = op_labels[iop];
int combine, ndim;
ndim = PyArray_NDIM(op_in[iop]);
/*
* If there's just one operand and no output parameter,
* first try remapping the axes to the output to return
* a view instead of a copy.
*/
if (iop == 0 && nop == 1 && out == NULL) {
ret = NULL;
if (get_single_op_view(op_in[iop], iop, labels,
ndim_output, output_labels,
&ret) < 0) {
return NULL;
}
if (ret != NULL) {
return ret;
}
}
/*
* Check whether any dimensions need to be combined
*
* The char type may be either signed or unsigned, we
* need it to be signed here.
*/
combine = 0;
for (idim = 0; idim < ndim; ++idim) {
if ((signed char)labels[idim] < 0) {
combine = 1;
}
}
/* If any dimensions are combined, create a view which combines them */
if (combine) {
op[iop] = get_combined_dims_view(op_in[iop], iop, labels);
if (op[iop] == NULL) {
goto fail;
}
}
/* No combining needed */
else {
Py_INCREF(op_in[iop]);
op[iop] = op_in[iop];
}
}
/* Set the output op */
op[nop] = out;
/*
* Set up the labels for the iterator (output + combined labels).
* Can just share the output_labels memory, because iter_labels
* is output_labels with some more labels appended.
*/
iter_labels = output_labels;
ndim_iter = ndim_output;
for (label = min_label; label <= max_label; ++label) {
if (label_counts[label] > 0 &&
memchr(output_labels, label, ndim_output) == NULL) {
if (ndim_iter >= NPY_MAXDIMS) {
PyErr_SetString(PyExc_ValueError,
"too many subscripts in einsum");
goto fail;
}
iter_labels[ndim_iter++] = label;
}
}
/* Set up the op_axes for the iterator */
for (iop = 0; iop < nop; ++iop) {
op_axes[iop] = op_axes_arrays[iop];
if (prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop],
op_axes[iop], ndim_iter, iter_labels) < 0) {
goto fail;
}
}
/* Set up the op_dtypes if dtype was provided */
if (dtype == NULL) {
op_dtypes = NULL;
}
else {
op_dtypes = op_dtypes_array;
for (iop = 0; iop <= nop; ++iop) {
op_dtypes[iop] = dtype;
}
}
/* Set the op_axes for the output */
op_axes[nop] = op_axes_arrays[nop];
for (idim = 0; idim < ndim_output; ++idim) {
op_axes[nop][idim] = idim;
}
for (idim = ndim_output; idim < ndim_iter; ++idim) {
op_axes[nop][idim] = -1;
}
/* Set the iterator per-op flags */
for (iop = 0; iop < nop; ++iop) {
op_flags[iop] = NPY_ITER_READONLY|
NPY_ITER_NBO|
NPY_ITER_ALIGNED;
}
op_flags[nop] = NPY_ITER_READWRITE|
NPY_ITER_NBO|
NPY_ITER_ALIGNED|
NPY_ITER_ALLOCATE|
NPY_ITER_NO_BROADCAST;
iter_flags = NPY_ITER_EXTERNAL_LOOP|
NPY_ITER_BUFFERED|
NPY_ITER_DELAY_BUFALLOC|
NPY_ITER_GROWINNER|
NPY_ITER_REDUCE_OK|
NPY_ITER_REFS_OK|
NPY_ITER_ZEROSIZE_OK;
if (out != NULL) {
iter_flags |= NPY_ITER_COPY_IF_OVERLAP;
}
if (dtype == NULL) {
iter_flags |= NPY_ITER_COMMON_DTYPE;
}
/* Allocate the iterator */
iter = NpyIter_AdvancedNew(nop+1, op, iter_flags, order, casting, op_flags,
op_dtypes, ndim_iter, op_axes, NULL, 0);
if (iter == NULL) {
goto fail;
}
/* Initialize the output to all zeros */
ret = NpyIter_GetOperandArray(iter)[nop];
if (PyArray_AssignZero(ret, NULL) < 0) {
goto fail;
}
/***************************/
/*
* Acceleration for some specific loop structures. Note
* that with axis coalescing, inputs with more dimensions can
* be reduced to fit into these patterns.
*/
if (!NpyIter_RequiresBuffering(iter)) {
int ndim = NpyIter_GetNDim(iter);
switch (nop) {
case 1:
if (ndim == 2) {
if (unbuffered_loop_nop1_ndim2(iter) < 0) {
goto fail;
}
goto finish;
}
else if (ndim == 3) {
if (unbuffered_loop_nop1_ndim3(iter) < 0) {
goto fail;
}
goto finish;
}
break;
case 2:
if (ndim == 2) {
if (unbuffered_loop_nop2_ndim2(iter) < 0) {
goto fail;
}
goto finish;
}
else if (ndim == 3) {
if (unbuffered_loop_nop2_ndim3(iter) < 0) {
goto fail;
}
goto finish;
}
break;
}
}
/***************************/
if (NpyIter_Reset(iter, NULL) != NPY_SUCCEED) {
goto fail;
}
/*
* Get an inner loop function, specializing it based on
* the strides that are fixed for the whole loop.
*/
NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
sop = get_sum_of_products_function(nop,
NpyIter_GetDescrArray(iter)[0]->type_num,
NpyIter_GetDescrArray(iter)[0]->elsize,
fixed_strides);
#if NPY_EINSUM_DBG_TRACING
NpyIter_DebugPrint(iter);
#endif
/* Finally, the main loop */
if (sop == NULL) {
PyErr_SetString(PyExc_TypeError,
"invalid data type for einsum");
}
else if (NpyIter_GetIterSize(iter) != 0) {
NpyIter_IterNextFunc *iternext;
char **dataptr;
npy_intp *stride;
npy_intp *countptr;
NPY_BEGIN_THREADS_DEF;
iternext = NpyIter_GetIterNext(iter, NULL);
if (iternext == NULL) {
NpyIter_Deallocate(iter);
goto fail;
}
dataptr = NpyIter_GetDataPtrArray(iter);
stride = NpyIter_GetInnerStrideArray(iter);
countptr = NpyIter_GetInnerLoopSizePtr(iter);
NPY_BEGIN_THREADS_NDITER(iter);
NPY_EINSUM_DBG_PRINT("Einsum loop\n");
do {
sop(nop, dataptr, stride, *countptr);
} while(iternext(iter));
NPY_END_THREADS;
/* If the API was needed, it may have thrown an error */
if (NpyIter_IterationNeedsAPI(iter) && PyErr_Occurred()) {
goto fail;
}
}
finish:
if (out != NULL) {
ret = out;
}
Py_INCREF(ret);
NpyIter_Deallocate(iter);
for (iop = 0; iop < nop; ++iop) {
Py_DECREF(op[iop]);
}
return ret;
fail:
for (iop = 0; iop < nop; ++iop) {
Py_XDECREF(op[iop]);
}
return NULL;
}