Skip to content

Commit

Permalink
Merge branch 'feature/sumd-npr' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
paboyle committed Mar 16, 2022
2 parents f99c366 + b615fa0 commit 605cf40
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 8 deletions.
25 changes: 25 additions & 0 deletions Grid/lattice/Lattice_reduction.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,15 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu_large(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}

template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
Expand All @@ -159,6 +168,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
return ssum;
}

template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu_large(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum);
return ssum;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
73 changes: 65 additions & 8 deletions Grid/lattice/Lattice_reduction_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
}

template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {

int device;
#ifdef GRID_CUDA
Expand All @@ -37,13 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;

/*
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;

*/
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
Expand All @@ -53,12 +53,12 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
threads = warpSize;
if ( threads*sizeofsobj > sharedMemPerBlock ) {
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
exit(EXIT_FAILURE);
return 0;
}
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);

return 1;
}

template <class sobj, class Iterator>
Expand Down Expand Up @@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
Expand All @@ -207,7 +207,9 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
Integer size = osites*nsimd;

Integer numThreads, numBlocks;
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
assert(ok);

Integer smemSize = numThreads * sizeof(sobj);

Vector<sobj> buffer(numBlocks);
Expand All @@ -218,6 +220,54 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
auto result = buffer_v[0];
return result;
}

template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
scalarD *ret_p = (scalarD *)&ret;

const int words = sizeof(vobj)/sizeof(vector);

Vector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
for(int w=0;w<words;w++) {

accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});

ret_p[w] = sumD_gpu_small(tbuf,osites);
}
return ret;
}

template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;

Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);

if ( ok ) {
ret = sumD_gpu_small(lat,osites);
} else {
ret = sumD_gpu_large(lat,osites);
}
return ret;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand All @@ -230,6 +280,13 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
return result;
}


template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}

NAMESPACE_END(Grid);

0 comments on commit 605cf40

Please sign in to comment.