In [1]:
from __future__ import division
import numpy as np
import math as mt

In [2]:
def ceil(x):
    """Round ``x`` up to the nearest integer and return it as a built-in int."""
    return int(mt.ceil(x))


# Number of elements to reduce (np.int32(1024*5) is a larger alternative).
L = np.int32(12)

# Threads per block as (x, y, z).  A size-dependent alternative would be
# (max(min(int(L), 1024), 32), 1, 1).
threadsize = (1024, 1, 1)

# Enough blocks along x to give every element its own thread, capped at
# 2147483647 (presumably the device limit for the grid x-dimension).
gridsize = (min(2147483647, ceil(L / threadsize[0])), 1)

# Random test data in the range [0, 10).
A = np.random.rand(L) * 10

In [3]:
# Show the launch grid as (blocks_x, blocks_y).
gridsize

(1, 1)

In [4]:
def lazy_arg_min(A):
    """Find the smallest element of ``A`` with a plain Python loop.

    This is the straightforward CPU reference used to check the GPU kernel.

    Parameters
    ----------
    A : array_like
        Values to search.  Multi-dimensional input is flattened first, so
        the returned index refers to the flattened array (the same
        convention as ``np.argmin`` without an ``axis``).

    Returns
    -------
    (arg_min, min_val) : tuple
        Index of the first occurrence of the minimum, and the minimum itself.

    Raises
    ------
    ValueError
        If ``A`` contains no elements.
    """
    values = np.ravel(A)
    if values.size == 0:
        raise ValueError("lazy_arg_min() requires a non-empty array")

    # Seed with the first element so the loop needs no special first-pass case.
    arg_min, min_val = 0, values[0]
    for i, a in enumerate(values):
        # Strict '<' keeps the earliest index when values are tied.
        if a < min_val:
            arg_min, min_val = i, a
    return arg_min, min_val

In [5]:
from pycuda import driver, compiler, gpuarray, tools

import os
# pycuda.autoinit chooses its device from the CUDA_DEVICE environment
# variable, so this must be set before the import below.  Device 1 is only
# the default: a value already exported by the user is left untouched.
os.environ.setdefault('CUDA_DEVICE', '1')

import pycuda.autoinit

In [6]:
# CUDA source for a per-block min/argmin reduction over an array of doubles.
# MinKernel writes one (min, argmin) pair per block; when more than one block
# is launched the host still has to combine the per-block results.
# The reduction helpers assume one-dimensional thread blocks of at most
# 1024 threads (32 warps).
kernel_code = """#define CUDART_INF_D __longlong_as_double(0x7ff0000000000000LL)

// Index carried by threads that hold no array element. It is larger than
// any valid index, so it loses every tie against a real element.
#define NO_INDEX 0xffffffffu

// __shfl_down is deprecated since CUDA 9; use the _sync variant when present.
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
#define SHFL_DOWN(value, offset) __shfl_down_sync(0xffffffffu, (value), (offset))
#else
#define SHFL_DOWN(value, offset) __shfl_down((value), (offset))
#endif

// Reduce (*val, *arg) across the calling warp; lane 0 ends up holding the
// minimum value and the index it came from. Equal values resolve to the
// smaller index so the result matches a first-occurrence argmin.
__inline__ __device__
void warpReduceMin(double* val, unsigned int* arg)
{
    const unsigned int warp_size = warpSize;
    const unsigned int lane = threadIdx.x % warp_size;

    // Number of lanes of this warp that really exist: the last warp of a
    // block is partial when the block size is not a multiple of the warp size.
    const unsigned int warp_start = threadIdx.x - lane;
    const unsigned int remaining = blockDim.x - warp_start;
    const unsigned int live = (remaining < warp_size) ? remaining : warp_size;

    for (unsigned int offset = warp_size / 2; offset > 0; offset /= 2)
    {
        // Every lane executes the shuffles; the result is only used when
        // the source lane exists, because a shuffle from a missing lane
        // yields an undefined value.
        double       tmp_val = SHFL_DOWN(*val, offset);
        unsigned int tmp_arg = SHFL_DOWN(*arg, offset);

        if (lane + offset < live &&
            (tmp_val < *val || (tmp_val == *val && tmp_arg < *arg)))
        {
            *val = tmp_val;
            *arg = tmp_arg;
        }
    }
}


// Reduce (*val, *arg) across the whole block; thread 0 ends up holding the
// block minimum and its index.
__inline__ __device__
void blockReduceMin(double* val,
                    unsigned int* arg)
{
    static __shared__ double shared_val[32];       // one partial min per warp
    static __shared__ unsigned int shared_arg[32]; // one partial argmin per warp

    const unsigned int lane = threadIdx.x % warpSize;
    const unsigned int wid = threadIdx.x / warpSize;
    // Round up so that a trailing partial warp is counted as well.
    const unsigned int num_warps = (blockDim.x + warpSize - 1) / warpSize;

    warpReduceMin(val, arg);      // each warp performs a partial reduction

    if (lane == 0)
    {
        shared_val[wid] = *val;   // publish the warp result
        shared_arg[wid] = *arg;
    }

    __syncthreads();              // wait for all partial reductions

    if (wid == 0)
    {
        // Only slots written by an existing warp are read; the remaining
        // lanes take part with a neutral element.
        const bool has_partial = lane < num_warps;
        *val = has_partial ? shared_val[lane] : CUDART_INF_D;
        *arg = has_partial ? shared_arg[lane] : NO_INDEX;

        warpReduceMin(val, arg);  // final reduce within the first warp
    }
}

// For each block, store the minimum of the elements of A visited by that
// block in min[blockIdx.x] and its index in arg_min[blockIdx.x].
// A block that visits no element stores +infinity and NO_INDEX.
__global__ void MinKernel(double* A,
                          double* min,
                          unsigned int* arg_min,
                          unsigned int L)
{
    const unsigned int start = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int stride = blockDim.x * gridDim.x;

    // Neutral element for threads that lie beyond the end of the array.
    double grid_stride_min = CUDART_INF_D;
    unsigned int grid_stride_arg_min = NO_INDEX;

    if (start < L)
    {
        grid_stride_min = A[start];
        grid_stride_arg_min = start;

        // Grid-stride loop over the remaining elements owned by this
        // thread. Indices increase, so '<' keeps the first occurrence.
        for (unsigned int i = start + stride; i < L; i += stride)
        {
            const double a = A[i];
            if (a < grid_stride_min)
            {
                grid_stride_min = a;
                grid_stride_arg_min = i;
            }
        }
    }

    // All threads of the block must reach the block reduction because it
    // contains a __syncthreads().
    blockReduceMin(&grid_stride_min, &grid_stride_arg_min);
    if (threadIdx.x == 0)
    {
        min[blockIdx.x] = grid_stride_min;
        arg_min[blockIdx.x] = grid_stride_arg_min;
    }
}
"""

In [7]:
# Compile the CUDA source and look up the kernel entry point by name.
mod = compiler.SourceModule(kernel_code)
gpu_min = mod.get_function('MinKernel')
# Declare the argument layout used by prepared_call: three pointers ('P')
# for A, min and arg_min, followed by a 32-bit integer for L.
# NOTE(review): the kernel declares L as unsigned while np.int32 is signed;
# the widths match, so non-negative lengths are passed through unchanged.
gpu_min.prepare(['P', 'P', 'P', np.int32])

<pycuda._driver.Function at 0x7f037a6f9b18>

In [8]:
# Upload the input as contiguous float64, which is what the kernel's
# `double* A` parameter expects.
A_gpu = gpuarray.to_gpu(np.ascontiguousarray(A, dtype=np.float64))
# One output slot per block.  Allocate the buffers directly on the device
# with explicit dtypes instead of uploading a slice of the input.
min_gpu = gpuarray.empty(gridsize[0], dtype=np.float64)
arg_min_gpu = gpuarray.zeros(gridsize[0], dtype=np.uint32)

In [9]:
%%time
# Launch the reduction: one (min, argmin) pair is written per block.
gpu_min.prepared_call(gridsize, threadsize,
                      A_gpu.gpudata,
                      min_gpu.gpudata,
                      arg_min_gpu.gpudata,
                      L)
# Kernel launches are asynchronous; wait for completion so that the cell
# timing covers the kernel execution and not just the launch overhead.
driver.Context.synchronize()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 388 µs


In [10]:
# Copy the results back to the host; each array holds one entry per
# launched block.
arg_min_gpu.get(), min_gpu.get()

(array([11], dtype=uint32), array([ 2.2103547]))

In [11]:
%%time
# Pure-Python reference result for comparison with the GPU output.
lazy_arg_min(A)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 15 µs


(11, 2.2103546957744022)

In [12]:
%%time
# NumPy reference result for the same array.
np.argmin(A), np.min(A)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 34.8 µs


(11, 2.2103546957744022)