In [79]:
from numba import cuda
import numpy as np 
import math
import random

In [80]:
# Problem size: X is created below with n rows and k columns.
n = 1000
k = 64
# Sentinel magnitudes. SMALL is the initial "best so far" value written by
# cuda_dist; LARGE is not referenced in the visible cells.
LARGE = 9999
SMALL = -9999.0

In [81]:
# Host-side buffers. X is refilled with seeded values in a later cell; the six
# result buffers only provide shape and dtype for the device allocations.
# Index buffers are explicitly int32 to match the `int32[:]` parameters in the
# cuda_dist signature (the removed `np.int` alias meant the platform Python
# int, i.e. 64-bit on Linux).
X = np.array(np.random.random((n, k)), dtype=np.float32)
first_val = np.array(np.random.random((n,)), dtype=np.float32)
first_index = np.array(np.random.random((n,)), dtype=np.int32)
second_val = np.array(np.random.random((n,)), dtype=np.float32)
second_index = np.array(np.random.random((n,)), dtype=np.int32)
third_val = np.array(np.random.random((n,)), dtype=np.float32)
third_index = np.array(np.random.random((n,)), dtype=np.int32)

In [82]:
# Seed the legacy global generator so the input matrix is reproducible, then
# fill X in place with a single vectorised draw. The draw produces n*k values
# in row-major order, which is the same order the former nested i/j loop
# consumed them, and the assignment casts to X's float32 dtype as before.
np.random.seed(39)
X[...] = np.random.uniform(size=(n, k))

In [83]:
# Rebind every host buffer to a fresh ndarray copy of itself (np.array copies
# by default); values, shapes and dtypes are unchanged.
(
    X,
    first_val,
    first_index,
    second_val,
    second_index,
    third_val,
    third_index,
) = (
    np.array(host_buf)
    for host_buf in (
        X,
        first_val,
        first_index,
        second_val,
        second_index,
        third_val,
        third_index,
    )
)

In [84]:
# Sanity check: the input matrix should be (n, k).
X.shape

(1000, 64)

In [85]:
# Sanity check: each per-row result buffer should have length n.
first_val.shape

(1000,)

In [86]:
# Preview the seeded input matrix (NumPy abbreviates large arrays with "...").
print(X)

[[0.5468892  0.797899   0.8204019  ... 0.89582473 0.8865771  0.01363686]
 [0.8688695  0.30145565 0.94794893 ... 0.17431284 0.56729794 0.9840169 ]
 [0.52914655 0.43819624 0.91374445 ... 0.27539888 0.67386234 0.7993358 ]
 ...
 [0.4090994  0.20901293 0.4140183  ... 0.28330776 0.75779134 0.45883173]
 [0.8109479  0.40118718 0.5222902  ... 0.97156405 0.66615933 0.41582605]
 [0.05375513 0.60521764 0.273265   ... 0.8919072  0.9904292  0.85547155]]


In [87]:
@cuda.jit('void(float32[:,:],float32[:,:])')
def cuda_mult(X,res):
    """Compute one element of the matrix product X @ X into ``res``.

    Intended for a 2-D launch: the y grid axis selects the output row and the
    x grid axis selects the output column. Each thread writes
    ``res[row, col] = sum_i X[row, i] * X[i, col]``.

    Parameters
    ----------
    X : float32[:, :]
        Input matrix. The product X @ X is only defined for a square matrix;
        if X is not square, only its leading square block is multiplied so
        that no thread indexes outside X.
    res : float32[:, :]
        Output matrix. Elements outside the computed block are left untouched.

    Notes
    -----
    Extents are taken from the array shapes rather than from a module-level
    size constant, so the kernel stays correct if the problem size changes
    after the kernel has been compiled.
    """
    row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    # Side length of the square block that can be safely multiplied.
    size = min(X.shape[0], X.shape[1])

    # Surplus threads in the last blocks, or threads that would fall outside
    # either array, have nothing to do.
    if row >= size or col >= size:
        return
    if row >= res.shape[0] or col >= res.shape[1]:
        return

    # Accumulate starting from a Python float literal, as before.
    temp_sum = 0.0
    for i in range(size):
        temp_sum += X[row, i] * X[i, col]
    res[row, col] = temp_sum

In [88]:
@cuda.jit('void(float32[:,:],float32[:],int32[:],float32[:],int32[:],float32[:],int32[:])')
def cuda_dist(X,first_best_val,first_best_index,second_best_val,second_best_index,third_best_val,third_best_index):
    """Find, for every row of X, the three other rows with the highest cosine similarity.

    Intended for a 1-D launch in which each thread owns one row of X. The
    thread compares its row against every other row and keeps a running
    top-three, stored directly in the output arrays at position ``row``.

    Parameters
    ----------
    X : float32[:, :]
        Input matrix; each row is one vector.
    first_best_val, second_best_val, third_best_val : float32[:]
        Outputs: highest, second-highest and third-highest cosine similarity
        found for each row. Slots that are never filled keep the value SMALL.
    first_best_index, second_best_index, third_best_index : int32[:]
        Outputs: row index of the corresponding match, or -1 if the slot was
        never filled.

    Notes
    -----
    The row and column counts are read from ``X.shape`` instead of
    module-level size constants, so the kernel follows the array it is given
    even if those constants change after compilation.
    """
    row = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    n_rows = X.shape[0]
    n_dims = X.shape[1]

    # Threads past the last row (from rounding the grid up) do nothing.
    if row >= n_rows:
        return

    # Start every slot at the sentinel so any real similarity replaces it.
    first_best_val[row] = SMALL
    first_best_index[row] = -1
    second_best_val[row] = SMALL
    second_best_index[row] = -1
    third_best_val[row] = SMALL
    third_best_index[row] = -1

    # The squared magnitude of this thread's own row does not depend on the
    # candidate row, so compute it (and its square root) once up front.
    row_sq = 0.0
    for j in range(n_dims):
        row_sq += (X[row, j] * X[row, j])
    row_norm = math.sqrt(row_sq)

    for i in range(n_rows):
        # A row is never its own neighbour.
        if i == row:
            continue

        dot = 0.0
        other_sq = 0.0
        for j in range(n_dims):
            dot += X[row, j] * X[i, j]
            other_sq += (X[i, j] * X[i, j])

        # Cosine similarity between row `row` and row `i`.
        sim = dot / (row_norm * math.sqrt(other_sq))

        # Insert into the running top-three, shifting lower ranks down.
        # Ties go to the later row for first/second place (>=) but to the
        # earlier row for third place (>), matching the original comparisons.
        if sim >= first_best_val[row]:
            third_best_val[row] = second_best_val[row]
            third_best_index[row] = second_best_index[row]
            second_best_val[row] = first_best_val[row]
            second_best_index[row] = first_best_index[row]
            first_best_val[row] = sim
            first_best_index[row] = i
        elif sim >= second_best_val[row]:
            third_best_val[row] = second_best_val[row]
            third_best_index[row] = second_best_index[row]
            second_best_val[row] = sim
            second_best_index[row] = i
        elif sim > third_best_val[row]:
            third_best_val[row] = sim
            third_best_index[row] = i

In [89]:
# Handle to the current CUDA device; its WARP_SIZE is used below to size the launch.
device = cuda.get_current_device()

In [90]:
# Warp size reported by the device; reused below as the threads-per-block count.
device.WARP_SIZE 

32

In [91]:
# Move the input matrix to the device.
d_x = cuda.to_device(X)

# Create one device array per result buffer, each modelled on its host counterpart.
(
    d_first_val,
    d_first_index,
    d_second_val,
    d_second_index,
    d_third_val,
    d_third_index,
) = (
    cuda.device_array_like(host_buf)
    for host_buf in (
        first_val,
        first_index,
        second_val,
        second_index,
        third_val,
        third_index,
    )
)

# Launch configuration: one warp per block, and enough blocks to cover all n
# rows (integer ceiling division).
tpb = device.WARP_SIZE
bpg = (n + tpb - 1) // tpb

In [92]:
# Threads per block, blocks per grid, and total threads launched (should be >= n).
tpb, bpg, tpb*bpg

(32, 32, 1024)

In [93]:
%time cuda_dist[bpg,tpb](d_x,d_first_val,d_first_index,d_second_val,d_second_index,d_third_val,d_third_index)

CPU times: user 1.05 ms, sys: 5 µs, total: 1.05 ms
Wall time: 807 µs


In [94]:
# Bring the best-match similarities and their row indices back to the host,
# then preview the first ten of each.
first_val = d_first_val.copy_to_host()
first_index = d_first_index.copy_to_host()
print(first_val[:10], first_index[:10], sep="\n")

[0.8690864  0.85467416 0.88301265 0.8490018  0.85267186 0.8621937
 0.87595296 0.86882657 0.8589149  0.849379  ]
[569  56 608 717 273 268 923 133 736 691]


In [95]:
# Bring the second-best similarities and their row indices back to the host,
# then preview the first ten of each.
second_val = d_second_val.copy_to_host()
second_index = d_second_index.copy_to_host()
print(second_val[:10], second_index[:10], sep="\n")

[0.8519672  0.85417455 0.8783227  0.8452939  0.8482382  0.8562994
 0.85805076 0.859112   0.84838    0.8474211 ]
[984 374 848 581 210 974 960 217 862 758]


In [96]:
# Bring the third-best similarities and their row indices back to the host,
# then preview the first ten of each.
third_val = d_third_val.copy_to_host()
third_index = d_third_index.copy_to_host()
print(third_val[:10], third_index[:10], sep="\n")

[0.850451   0.84890485 0.869244   0.8399661  0.8410316  0.8502314
 0.85478884 0.8560264  0.8478199  0.8473342 ]
[633 909 347 842 613 576 862 409 906 948]


In [97]:
# Record the GPU model, driver/CUDA versions and current memory use for this run.
!nvidia-smi

Mon May 20 03:29:57 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.72       Driver Version: 410.72       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   37C    P0    56W / 300W |    454MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------