In [28]:
from numba import cuda
import numpy as np 
import math
import random

In [29]:
# Problem size: n row vectors with k components each.
n, k = 1000, 64
# Sentinel values; SMALL is the starting value of the running maximum in cuda_dist.
LARGE, SMALL = 9999, -9999.0

In [30]:
# Host-side float32 buffers (float32 matches the kernel signatures).
# X is the (n, k) input matrix; res_val / res_index are per-row output placeholders.
X = np.random.random((n, k)).astype(np.float32)
res_val = np.random.random((n,)).astype(np.float32)
res_index = np.random.random((n,)).astype(np.float32)

In [31]:
# Seed NumPy's global RNG so the contents of X are reproducible, then refill X
# with uniform [0, 1) samples in a single vectorized draw instead of n*k
# separate Python-level calls. The draw fills the (n, k) block in row-major
# order, i.e. the same order the element-by-element loop consumed the stream.
# Assigning through a slice writes into the existing float32 buffer.
np.random.seed(39)
X[:, :] = np.random.uniform(size=(n, k))

In [32]:
# Rebind each host array to a fresh copy of itself (np.array copies by default).
X, res_val, res_index = (np.array(arr) for arr in (X, res_val, res_index))

In [33]:
# Sanity check: X was allocated with shape (n, k).
X.shape

(1000, 64)

In [34]:
# Sanity check: one output slot per row of X, i.e. shape (n,).
res_val.shape

(1000,)

In [35]:
# Show X after the seeded refill (NumPy abbreviates the large array with '...').
print(X)

[[0.5468892  0.797899   0.8204019  ... 0.89582473 0.8865771  0.01363686]
 [0.8688695  0.30145565 0.94794893 ... 0.17431284 0.56729794 0.9840169 ]
 [0.52914655 0.43819624 0.91374445 ... 0.27539888 0.67386234 0.7993358 ]
 ...
 [0.4090994  0.20901293 0.4140183  ... 0.28330776 0.75779134 0.45883173]
 [0.8109479  0.40118718 0.5222902  ... 0.97156405 0.66615933 0.41582605]
 [0.05375513 0.60521764 0.273265   ... 0.8919072  0.9904292  0.85547155]]


In [36]:
# f= open("valX.txt","w")

In [37]:
# Disabled export code, kept as a bare string literal: nothing in it executes,
# the cell merely echoes the string. If re-enabled it would write every row of X
# to the file handle `f` as a bracketed, comma-separated line and close the file.
# `f` itself is only opened in the commented-out line of the previous cell.
'''
for i in range(n):
    f.write("[")
    for j in range(k):
        if(j!=(k-1)):
            f.write(str(X[i,j])+",")
        else:
            f.write(str(X[i,j]))
    f.write("],\n")
f.close()
'''

'\nfor i in range(n):\n    f.write("[")\n    for j in range(k):\n        if(j!=(k-1)):\n            f.write(str(X[i,j])+",")\n        else:\n            f.write(str(X[i,j]))\n    f.write("],\n")\nf.close()\n'

In [38]:
@cuda.jit('void(float32[:,:],float32[:,:])')
def cuda_mult(X,res):
    """CUDA kernel: each thread computes one element of the product X @ X.

    The thread's (row, col) position in the 2-D launch grid selects the output
    element ``res[row, col] = sum_i X[row, i] * X[i, col]``.

    Parameters
    ----------
    X : float32[:, :]
        Input matrix. The product is only meaningful when X is square.
    res : float32[:, :]
        Output matrix; must be at least as large as X in both dimensions.

    Notes
    -----
    Sizes are read from ``X.shape`` instead of the module-level ``n``. Numba
    treats globals as constants fixed when the kernel is compiled (here at
    decoration time, because an explicit signature is given), so a later
    change to ``n`` would be silently ignored, and ``n`` does not have to
    match the array that is actually passed in.
    """
    row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    n_rows = X.shape[0]
    n_cols = X.shape[1]
    # The grid is usually rounded up to whole blocks; surplus threads do nothing.
    if row >= n_rows or col >= n_cols:
        return
    # Equals the matrix size for square X; min() keeps both X[row, i] and
    # X[i, col] inside the array if a non-square X is passed by mistake.
    inner = min(n_rows, n_cols)
    acc = 0.0
    for i in range(inner):
        acc += X[row, i] * X[i, col]
    res[row, col] = acc

In [39]:
@cuda.jit('void(float32[:,:],float32[:],float32[:])')
def cuda_dist(X,res_val,res_index):
    """CUDA kernel: for every row of X, find the most similar other row.

    One thread handles one row. It compares that row against every other row
    using cosine similarity (dot product divided by the product of the two
    Euclidean norms) and keeps the largest value. Despite the name, the
    quantity is a similarity that is maximised, not a distance that is
    minimised.

    Parameters
    ----------
    X : float32[:, :]
        Input matrix; each row is one vector.
    res_val : float32[:]
        Output: ``res_val[row]`` receives the highest similarity found, or
        ``SMALL`` if X has no other row.
    res_index : float32[:]
        Output: ``res_index[row]`` receives the index of that best-matching
        row (stored as float32 because of the buffer type), or -1 if X has no
        other row.

    Notes
    -----
    Row and column counts are taken from ``X.shape`` instead of the
    module-level ``n`` and ``k``. Numba treats globals as constants fixed when
    the kernel is compiled (here at decoration time, because an explicit
    signature is given), so later changes to ``n`` / ``k`` would be silently
    ignored. No guard is applied for all-zero rows, whose norm is 0.
    """
    row = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    n_rows = X.shape[0]
    n_cols = X.shape[1]
    # The grid is rounded up to whole blocks; surplus threads do nothing.
    if row >= n_rows:
        return

    # The norm of this thread's own row does not depend on the candidate row,
    # so compute it once instead of once per candidate.
    row_sq = 0.0
    for j in range(n_cols):
        row_sq += X[row, j] * X[row, j]
    row_norm = math.sqrt(row_sq)

    best_sim = SMALL
    best_index = -1
    for i in range(n_rows):
        # A row is never compared with itself.
        if i == row:
            continue
        dot = 0.0
        other_sq = 0.0
        for j in range(n_cols):
            dot += X[row, j] * X[i, j]
            other_sq += X[i, j] * X[i, j]
        sim = dot / (row_norm * math.sqrt(other_sq))
        # Strict '>' keeps the lowest index among equally similar rows.
        if sim > best_sim:
            best_sim = sim
            best_index = i
    res_val[row] = best_sim
    res_index[row] = best_index

In [40]:
# Handle to the currently selected CUDA device; queried below for its warp size.
device = cuda.get_current_device()

In [41]:
# Warp size of this device; reused below as the threads-per-block value.
device.WARP_SIZE 

32

In [42]:
# Stage the input on the GPU and allocate device output buffers shaped like
# the host placeholders.
d_x = cuda.to_device(X)
d_res_val = cuda.device_array_like(res_val)
d_res_index = cuda.device_array_like(res_index)

# 1-D launch geometry: one warp per block, and enough blocks (ceiling division)
# for every one of the n rows to get its own thread.
tpb = device.WARP_SIZE          # threads per block
bpg = (n + tpb - 1) // tpb      # blocks per grid

In [43]:
# Threads per block, blocks per grid, and total threads launched (>= n by construction).
tpb, bpg, tpb*bpg

(32, 32, 1024)

In [44]:
%time cuda_dist[bpg,tpb](d_x,d_res_val,d_res_index)

CPU times: user 0 ns, sys: 608 µs, total: 608 µs
Wall time: 497 µs


In [45]:
# Copy the per-row best similarity values written by cuda_dist from device
# memory back to the host, replacing the placeholder contents of res_val.
res_val = d_res_val.copy_to_host()
print (res_val)

[0.8690864  0.85467416 0.88301265 0.8490018  0.85267186 0.8621937
 0.87595296 0.86882657 0.8589149  0.849379   0.87923306 0.86790735
 0.83443385 0.84318024 0.8517261  0.8392454  0.85111845 0.8466477
 0.86052084 0.8483591  0.864106   0.8529485  0.8577581  0.8268551
 0.8429862  0.85430205 0.8507139  0.8579091  0.84815866 0.8809168
 0.8461517  0.8659869  0.829557   0.862795   0.8584633  0.85341126
 0.8644989  0.86239195 0.8822282  0.84876364 0.88062954 0.8494619
 0.858836   0.8759234  0.8487768  0.8623828  0.8494772  0.8650317
 0.8666638  0.8506872  0.84836304 0.8534516  0.87923306 0.8687139
 0.862795   0.86774546 0.87171036 0.84634554 0.87875843 0.8639399
 0.8559236  0.87744945 0.86189026 0.84074134 0.8745966  0.85650796
 0.85151803 0.8652786  0.8320228  0.8696339  0.8695399  0.85231453
 0.8743733  0.86814976 0.88977194 0.8647865  0.865956   0.8429384
 0.8454307  0.843751   0.821149   0.86754817 0.8694855  0.8466128
 0.8719161  0.8474679  0.8627223  0.87483823 0.8361943  0.8593121
 0.848

In [46]:
# Copy the per-row best-match indices written by cuda_dist from device memory
# back to the host. The buffer is float32, so indices print with a trailing '.'.
res_index = d_res_index.copy_to_host()
print (res_index)

[569.  56. 608. 717. 273. 268. 923. 133. 736. 691.  52. 279. 324. 241.
 820. 772.  71. 663. 481.  94. 881. 159. 494. 829.  67. 948. 527. 208.
 480. 597. 321. 654. 582.  54. 180. 445. 716.  72. 848. 109. 906. 432.
 571. 272. 158. 292. 898. 746. 437. 855. 868. 566.  10. 241.  33. 450.
 439. 736. 848. 643. 424. 383. 607. 850. 272. 848. 866. 848. 967. 110.
 693. 386. 848. 376. 517. 974. 929. 949.  58. 789. 518.  72. 716.  55.
  29. 854. 736. 561. 250. 401. 197. 582. 575. 868.  58. 564. 620. 999.
 360. 246. 837. 163. 123. 567. 307. 439.  11. 581. 730. 174.  69. 330.
 231. 518. 848. 141. 559. 390. 808. 700. 130. 986.  67. 102. 848. 273.
 914. 401. 930. 658. 981. 959. 197.   7. 522.  34. 526. 630. 220. 848.
 494. 115. 359.  43.  72. 899. 231. 564. 518.  72. 504. 212. 807. 979.
 461. 807. 544. 390.  44. 717. 687. 974. 886. 172. 693. 519. 835. 444.
 272. 208. 336. 724. 163. 628. 109. 348. 736. 250. 914. 493.  34.  97.
 208. 628. 288. 229. 428. 874. 397.  81. 523. 762. 527. 253.  11. 292.
 768. 

In [79]:
# Shell out to nvidia-smi to report the GPU model, driver and CUDA version in use.
!nvidia-smi

Fri May 17 10:39:20 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.72       Driver Version: 410.72       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   37C    P0    59W / 300W |   1839MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------