In [31]:
import numpy as np
import time
import seaborn as sns

import math

# For GPU stuff
from numba import cuda
import numba

In [32]:
# Basic device query

my_gpu = numba.cuda.get_current_device()

print "Running on GPU:", my_gpu.name
cores_per_capability = {
	1: 8,
	2: 32,
	3: 192,
    5: 128,
}

cc = my_gpu.compute_capability
print "Compute capability: ", "%d.%d" %cc, 
majorcc = cc[0]
print "\nNumber of streaming pultiprocess:", my_gpu.MULTIPROCESSOR_COUNT
cores_per_multiprocessor = cores_per_capability[majorcc]
print "Number of cores per multiprocessor:", cores_per_multiprocessor
total_cores = cores_per_multiprocessor * my_gpu.MULTIPROCESSOR_COUNT
print "Number of cores on GPU:", total_cores

xDim = my_gpu.MAX_BLOCK_DIM_X
yDim = my_gpu.MAX_BLOCK_DIM_Y
zDim = my_gpu.MAX_BLOCK_DIM_Z
print "Max dimension size of a thread block (x, y, z): (", xDim, ",", yDim, ",",zDim,")"
print "Maximum number of threads per block:",  my_gpu.MAX_THREADS_PER_BLOCK
xGrid = my_gpu.MAX_GRID_DIM_X
yGrid = my_gpu.MAX_GRID_DIM_X
zGrid = my_gpu.MAX_GRID_DIM_Z
print "Max dimension size of a thread block (x, y, z): (", xGrid, ",", yGrid, ",",zGrid,")"
print "Warp Size:        ", my_gpu.WARP_SIZE


Running on GPU: GeForce GTX TITAN X
Compute capability:  5.2 
Number of streaming pultiprocess: 24
Number of cores per multiprocessor: 128
Number of cores on GPU: 3072
Max dimension size of a thread block (x, y, z): ( 1024 , 1024 , 64 )
Maximum number of threads per block: 1024
Max dimension size of a thread block (x, y, z): ( 2147483647 , 2147483647 , 65535 )
Warp Size:         32


In [116]:
# Squaring each element of an array

npts = 16777216

data = 1.*np.arange(1,npts+1)
print len(data)
print data

16777216
[  1.00000000e+00   2.00000000e+00   3.00000000e+00 ...,   1.67772140e+07
   1.67772150e+07   1.67772160e+07]


In [117]:
def square_CPU(x):
    out = np.zeros(len(x))
    for i in range(0,len(x)):
        out[i] = x[i]*x[i]
        
    return out

In [118]:
# Test it!
start = time.time()
s0 = square_CPU(data)
print "This took %s seconds" % (time.time()-start)
print s0

This took 3.95621204376 seconds
[  1.00000000e+00   4.00000000e+00   9.00000000e+00 ...,   2.81474910e+14
   2.81474943e+14   2.81474977e+14]


In [147]:
# Do the same on the GPU

@numba.cuda.jit("void(float64[:],float64[:])")
def square_GPU(data_in,data_out):
    
    tx = cuda.threadIdx.x #number of threads per block
    bx = cuda.blockIdx.x #number of blocks
    bw = cuda.blockDim.x # Block dimension/width to assign the index when there is more than 1 block
    
    idx = bw*bx + tx
    
    x = data_in[idx]
    
    #data_out[idx] = x*x
    data_out[idx] = math.log(x*x) * math.sin(x) * math.cos(x)
    #data_out[idx] = bx
    
    

In [148]:
#Set the thread count to the number of threads on our GPU
thread_ct = my_gpu.WARP_SIZE
#Set the block count
block_ct = int(math.ceil(float(npts) / thread_ct))

print block_ct,thread_ct

524288 32


In [149]:
start = time.time()
data_out = -999*np.ones(len(data))
square_GPU[block_ct,thread_ct](data,data_out)
print "This took %s seconds" % (time.time()-start)


This took 0.167119026184 seconds


In [150]:
#for d in data_out:
#    print d

In [151]:
print (data_out-s0).sum()

-1.57412230169e+21


In [152]:
start = time.time()
s1 = np.log(data*data) * np.sin(data) * np.cos(data)
print "This took %s seconds" % (time.time()-start)


This took 1.59368300438 seconds


In [154]:
print np.abs((data_out-s1)).sum()

7.59633468156e-09
