In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division


""" 
Multiples two square matrices together using multiple blocks and shared memory. 
Each thread block is assigned a "tile" of the resulting matrix and is responsible
for generating the elements in that tile.  Each thread in a block computes one element 
of the tile.
"""
import time
import math
import numpy as np
from scipy import sparse as sp
from numpy import linalg as la
from pycuda import driver, compiler, gpuarray, tools

import os
os.environ['CUDA_DEVICE'] = '0'
# -- initialize the device
import pycuda.autoinit

TestTranspose = False # tests the tranpose matrix product with a vector

filename = '/home/mhazoglou/CUDA_Kernels/Useful_Kernels.cu'

with open(filename) as fid:
    kernel_code = fid.read()

ceil = lambda x: int(math.ceil(x))

# define length of array
L_col = np.int32(2000)#32*32*10)#100000)#1024*1000000)
L_row = np.int32(200000)#1024*1+1)#13*13*13*13)


# create random arrays
a_cpu = sp.rand(L_row, L_col, density = 0.01, format = 'csr', dtype = np.float64)
nnz = a_cpu.nnz
print("Number of nonzero elements:", nnz)
if TestTranspose:
    v_cpu = np.random.randn(L_row, 1).astype(np.float64)
else:
    v_cpu = np.random.randn(L_col, 1).astype(np.float64)


# compute reference on the CPU to verify GPU computation
if TestTranspose:
    t0 = time.time()
    r_cpu = (a_cpu.transpose()).dot(v_cpu)
    print 'On CPU: {} seconds'.format(time.time()-t0)
else:
    t0 = time.time()
    r_cpu = a_cpu.dot(v_cpu)
    print 'On CPU: {} seconds'.format(time.time()-t0)

# transfer host (CPU) memory to device (GPU) memory 
idx = gpuarray.to_gpu(a_cpu.indices)
ptr = gpuarray.to_gpu(a_cpu.indptr)
a_gpu = gpuarray.to_gpu(a_cpu.data)
v_gpu = gpuarray.to_gpu(v_cpu)

# create zero initialized gpu array for the result (R = A * V)
if TestTranspose:
    r_init = np.zeros((L_col, 1), dtype = np.float64)
else:
    r_init = np.zeros((L_row, 1), dtype = np.float64)
r_gpu = gpuarray.to_gpu(r_init)

# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# get the kernel function from the compiled module
if TestTranspose:
    dot = mod.get_function('spmTv_csr_kernel')
    dot.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])
else:
    dot = mod.get_function('spmv_csr_kernel')
    dot.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])

# call the kernel on the card
BLOCKSIZE = 1024
grid_x = min(ceil(L_row / BLOCKSIZE), 1024)
t1 = time.time()
dot.prepared_call((grid_x, 1), (BLOCKSIZE, 1, 1),
                  L_row,
                  ptr.gpudata, idx.gpudata, a_gpu.gpudata,
                  v_gpu.gpudata, r_gpu.gpudata)

print 'On GPU: {} seconds'.format(time.time()-t1)

r_gpu = gpuarray.to_gpu(r_init)
t1 = time.time()
# # reinitialized zero gpu array for the result (R = A * V)
# #r_gpu = gpuarray.to_gpu(r_init) #gpuarray.empty((1, ceil(L/TILE_SIZE)), np.float32)

dot.prepared_call((grid_x, 1), (BLOCKSIZE, 1, 1),
                  L_row,
                  ptr.gpudata, idx.gpudata, a_gpu.gpudata,
                  v_gpu.gpudata, r_gpu.gpudata)
print 'On GPU second prepared call: {} seconds'.format(time.time()-t1)

print "-" * 80
print "Matrix A (GPU):"
print a_gpu.get()

print "-" * 80
print "Vector V (GPU):"
print v_gpu.get()

print "-" * 80
print "Vector R (GPU):"
print r_gpu.get()

print "-" * 80
print "Vector R (CPU):"
print r_cpu


print "-" * 80
print "CPU-GPU difference:"
print r_cpu - r_gpu.get()
print "L2 norm:", la.norm(r_cpu - r_gpu.get())
print np.allclose(r_cpu, r_gpu.get())

driver.stop_profiler()

('Number of nonzero elements:', 4000000)
On CPU: 0.00412011146545 seconds
On GPU: 0.000292062759399 seconds
On GPU second prepared call: 9.70363616943e-05 seconds
--------------------------------------------------------------------------------
Matrix A (GPU):
[ 0.22381526  0.58321306  0.82735869 ...,  0.88366254  0.9471917
  0.77406551]
--------------------------------------------------------------------------------
Vector V (GPU):
[[ 0.36687196]
 [ 0.44676566]
 [ 1.61404282]
 ..., 
 [-1.18794056]
 [-0.47910979]
 [-0.77500554]]
--------------------------------------------------------------------------------
Vector R (GPU):
[[-4.36695776]
 [ 1.16036724]
 [ 1.08176592]
 ..., 
 [-0.43955478]
 [-1.74111527]
 [ 0.09272674]]
--------------------------------------------------------------------------------
Vector R (CPU):
[[-4.36695776]
 [ 1.16036724]
 [ 1.08176592]
 ..., 
 [-0.43955478]
 [-1.74111527]
 [ 0.09272674]]
----------------------------------------------------------------------------

In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division


""" 
Multiples two square matrices together using multiple blocks and shared memory. 
Each thread block is assigned a "tile" of the resulting matrix and is responsible
for generating the elements in that tile.  Each thread in a block computes one element 
of the tile.
"""
import timeit
import math
import numpy as np
from scipy import sparse as sp
from numpy import linalg as la
from pycuda import driver, compiler, gpuarray, tools

from WeightsAndBiasInitialization import ptr_to_row

import os
os.environ['CUDA_DEVICE'] = '1'
# -- initialize the device
import pycuda.autoinit



filename = '/home/mhazoglou/CUDA_Kernels/Useful_Kernels.cu'

with open(filename) as fid:
    kernel_code = fid.read()

ceil = lambda x: int(math.ceil(x))

# define length of array
L_col = np.int32(1000)#100000)#1024*1000000)
L_row = np.int32(1000)#1024*1+1)#13*13*13*13)


# create random arrays
a_cpu = sp.rand(L_row, L_col, density = 0.09, format = 'csr')

vx_cpu = np.concatenate((np.ones((int(L_col/2), 1), dtype = np.float64),
                         2*np.ones((int(L_col/2), 1), dtype = np.float64)),
                        axis = 0)
# np.ones((L_col, 1), dtype = np.float64)

vy_cpu = np.concatenate((np.ones((int(L_row/2), 1), dtype = np.float64),
                         np.ones((int(L_row/2), 1), dtype = np.float64)),
                        axis = 0)
# np.ones((L_row, 1), dtype = np.float64)

row_idx = gpuarray.to_gpu(ptr_to_row(a_cpu.indptr))

# transfer host (CPU) memory to device (GPU) memory 
idx = gpuarray.to_gpu(a_cpu.indices)
ptr = gpuarray.to_gpu(a_cpu.indptr)
vx_gpu = gpuarray.to_gpu(vx_cpu)
vy_gpu = gpuarray.to_gpu(vy_cpu)

nnz = np.int32(a_cpu.nnz)
a_gpu = gpuarray.empty((nnz), dtype = np.float64)
b_gpu = gpuarray.empty((nnz), dtype = np.float64)

# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# get the kernel function from the compiled module
out = mod.get_function('spvv_csr_outer_kernel')
out.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])

out_coo = mod.get_function('spvv_coo_outer_kernel')
out_coo.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])

repeat = 1000

# call the kernel on the card
BLOCKSIZE = 1024
grid_x = min(ceil(L_row / BLOCKSIZE), 1024)
t1 = timeit.default_timer()
for _ in range(repeat):
    out.prepared_call((grid_x, 1), (BLOCKSIZE, 1, 1),
                      L_row,
                      ptr.gpudata, idx.gpudata, vx_gpu.gpudata,
                      vy_gpu.gpudata, a_gpu.gpudata)

print 'CSR On GPU: {} seconds'.format((timeit.default_timer()-t1) / repeat)


grid_x_coo = min(ceil(nnz / BLOCKSIZE), 1024)
t1 = timeit.default_timer()
for _ in range(repeat):
    out_coo.prepared_call((grid_x_coo, 1), (BLOCKSIZE, 1, 1),
                          nnz,
                          row_idx.gpudata, idx.gpudata,
                          vx_gpu.gpudata, vy_gpu.gpudata, b_gpu.gpudata)
print 'COO On GPU {} seconds'.format((timeit.default_timer() - t1) / repeat)

# t1 = time.time()
# # reinitialized zero gpu array for the result (R = A * V)
# #r_gpu = gpuarray.to_gpu(r_init) #gpuarray.empty((1, ceil(L/TILE_SIZE)), np.float32)

# r_gpu = gpuarray.to_gpu(r_init)
# dot.prepared_call((grid_x, 1), (BLOCKSIZE, 1, 1),
#                   L_row,
#                   ptr.gpudata, idx.gpudata, a_gpu.gpudata,
#                   v_gpu.gpudata, r_gpu.gpudata)
# print 'On GPU second prepared call: {} seconds'.format(time.time()-t1)

print "-" * 80
print "Vector Vx (GPU):"
print vx_gpu.get()

print "-" * 80
print "Vector Vy (GPU):"
print vy_gpu.get()

print "-" * 80
print "Sparse Matrix A (GPU-CSR):"
print a_gpu.get()

print "-" * 80
print "Sparse Matrix A (GPU-COO)"
print b_gpu.get()

print "-" * 80
print "Sparse Matrix A (CPU):"
print a_cpu.data.astype(np.bool)


dif = a_cpu.data.astype(np.bool) - a_gpu.get()
dif2 = a_cpu.data.astype(np.bool) - b_gpu.get()
print "-" * 80
print "CPU-GPU difference:"
print dif, dif2
print 'total:', sum(dif)
print 'total:', sum(dif2)

dif_gpu = a_gpu.get() - b_gpu.get()
print "GPU-GPU difference"
print dif_gpu
print 'total:', sum(dif_gpu)

driver.stop_profiler()

CSR On GPU: 5.69295883179e-06 seconds
COO On GPU 0.000169404029846 seconds
--------------------------------------------------------------------------------
Vector Vx (GPU):
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 

In [1]:
kernel_code = """

__inline__ __device__
double warpReduceSum(double val) {
  for (unsigned int offset = warpSize/2; offset > 0; offset /= 2) 
    val += __shfl_down(val, offset);
  return val;
}

__inline__ __device__
double blockReduceSum(double val) {

  static __shared__ double shared[32]; // Shared mem for 32 partial sums
  unsigned int lane = threadIdx.x % warpSize;
  unsigned int wid = threadIdx.x / warpSize;

  val = warpReduceSum(val);     // Each warp performs partial reduction

  if (lane==0) shared[wid]=val; // Write reduced value to shared memory

  __syncthreads();              // Wait for all partial reductions

  //read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.;

  if (wid==0) val = warpReduceSum(val); //Final reduce within first warp

  return val;
}

__global__ void 
Blocked1dCSRMatVecDotKernel(const unsigned int num_rows,
                const unsigned int  *ptr,
                const unsigned int  *indices,
                const double *data,
                const double *x,
                double *y)
{
    
    unsigned int  start_x = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int stride_x = blockDim.x * gridDim.x;
    unsigned int  start_y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int stride_y = blockDim.y * gridDim.y;
    
    for (unsigned int row = start_y; row < num_rows; row += stride_y)
    {
        double sum = 0.;
        unsigned int row_start = ptr[row];
        //unsigned int row_end   = ptr[row +1];
        unsigned int row_length = ptr[row +1] - row_start;
        for (unsigned int j = start_x; j < row_length; j += stride_x)
        {
            sum += data[row_start + j] * x[indices[row_start + j]];
        }
        
        sum = blockReduceSum(sum);
        if (threadIdx.x==0)
        {
            atomicAdd(&y[row], sum);
        }
        
    }
}

__global__ void 
BlockedCSRMatVecDotKernel(const unsigned int num_rows,
                const unsigned int  *ptr,
                const unsigned int  *indices,
                const double *data,
                const double *x,
                double *y)
{
    
    unsigned int  start_x = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int stride_x = blockDim.x * gridDim.x;
    unsigned int  start_y = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int stride_y = blockDim.y * gridDim.y;
    
    for (unsigned int row = start_y; row < num_rows; row += stride_y)
    {
        double sum = 0.;
        unsigned int row_start = ptr[row];
        //unsigned int row_end   = ptr[row +1];
        unsigned int row_length = ptr[row +1] - row_start;
        for (unsigned int j = start_x; j < row_length; j += stride_x)
        {
            sum += data[row_start + j] * x[indices[row_start + j]];
        }
        
        sum = warpReduceSum(sum);
        if (threadIdx.x==0)
        {
            atomicAdd(&y[row], sum);
        }
        
    }
}

/*
// product of a matrix with a vector
// R(L_R, 1) = A(L_R, L_V) V(L_V, 1)
// the result R needs to be initialized with zeros (use ZeroFillKernel) 
__global__ void BlockedCSRMatVecDotKernel(double *A, double *V, double *R, 
        unsigned int *new_idx_V_arr, unsigned int *new_idx_R_arr,
        unsigned int *new_block_arr, unsigned int N_blocks)
{
    unsigned int starting_block = threadIdx.z + blockIdx.z * blockDim.z;
    unsigned int stride_z = blockDim.z * gridDim.z;
    unsigned int starting_col = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int stride_x = blockDim.x * gridDim.x;
    unsigned int starting_row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int stride_y = blockDim.y * gridDim.y;
    for (unsigned int block = starting_block;
         block < N_blocks;
         block += stride_z)
    {
        double sum = 0.;
        unsigned int idx_block_start = new_block_arr[block];
        unsigned int idx_V_start = new_idx_V_arr[block];
        unsigned int idx_V_end = new_idx_V_arr[block + 1];
        unsigned int idx_R_start = new_idx_R_arr[block];
        unsigned int idx_R_end = new_idx_R_arr[block + 1];
        
        unsigned int L_V = idx_V_end - idx_V_start;
        unsigned int L_R = idx_R_end - idx_R_start;
        for (unsigned int row = starting_row;
             row < L_R;
             row += stride_y) 
        {
            //reduce multiple elements per thread
            for (unsigned int col = starting_col; 
                 col < L_V; 
                 col += stride_x) {
                //sum += A[idx_block_start + col + row * L_V] * V[idx_V_start + col];
                sum += A[col + row * L_V] * V[col];

            }

            sum = warpReduceSum(sum);
            //static __shared__ double shared[32]; // Shared mem for 32 partial sums
            //shared[threadIdx.y] = sum;


            if (threadIdx.x==0)
            {
                //atomicAdd(&R[idx_R_start + row], sum);//shared[threadIdx.y]);
                atomicAdd(&R[row], sum);//shared[threadIdx.y]);

            }

        }
    }
}
*/
"""

In [2]:
def looped_blockedMatVecProd(A, V, R, new_idx_V, new_idx_R, new_block, N_blocks):
    
    for z in xrange(N_blocks):
        idx_block_start = new_block[z]
        idx_V_start = new_idx_V[z]
        idx_V_end = new_idx_V[z + 1]
        idx_R_start = new_idx_R[z]
        idx_R_end = new_idx_R[z + 1]
        
        L_V = idx_V_end - idx_V_start
        L_R = idx_R_end - idx_R_start
        for row in xrange(L_R):
            for col in xrange(L_V):
                R[idx_R_start + row] += A[idx_block_start + col + row * L_V] * V[idx_V_start + col]
    return R

In [3]:
from RectangularGridConstructor import make_connections
from WeightsAndBiasInitialization import weight_initialize
#!/usr/bin/env python
# -*- coding: utf-8 -*-

structure = [16, 8, 4, 3, 2, 1]
output_sizes = [1, 2*2, 4*4, 8*8, 6*6, 16*16]
input_size = 6 * 6 * 3
hidden_size = 49
connect_dict = make_connections(structure, output_sizes, context_from_top_0_0=False)
i2h, h2op, maps, input_new_unit, op_new_unit = weight_initialize(connect_dict, input_size, hidden_size)

In [4]:
import numpy as np

N_blocks = np.int32(len(connect_dict))
hid_new_unit = range(0, hidden_size * N_blocks  + 1, hidden_size)

In [5]:
def new_block_idx(new_idx_V_arr, new_idx_R_arr):
    block_idx_list = []
    prev_idx_V, prev_idx_R = 0, 0
    for idx_V, idx_R in zip(new_idx_V_arr, new_idx_R_arr):
        L_V = idx_V-prev_idx_V
        L_R = idx_R-prev_idx_R
        if block_idx_list != []:
            block_idx_list.append(block_idx_list[-1] +
                                  L_V * L_R)
            if L_V > max_L_V:
                max_L_V = L_V
            if L_R > max_L_R:
                max_L_R = L_R
        else:
            block_idx_list.append(L_V * L_R)
            max_L_V = L_V
            max_L_R = L_R
        prev_idx_V, prev_idx_R = idx_V, idx_R
    return block_idx_list, max_L_V, max_L_R

In [6]:
new_block_list, max_L_V, max_L_R = new_block_idx(input_new_unit, hid_new_unit)

In [7]:
new_block_list

[0,
 30772,
 63945,
 97118,
 130291,
 163464,
 196637,
 229810,
 262983,
 296156,
 329329,
 362502,
 395675,
 428848,
 462021,
 495194,
 525966,
 559139,
 594713,
 630287,
 665861,
 701435,
 737009,
 772583,
 808157,
 843731,
 879305,
 914879,
 950453,
 986027,
 1021601,
 1057175,
 1090348,
 1123521,
 1159095,
 1194669,
 1230243,
 1265817,
 1301391,
 1336965,
 1372539,
 1408113,
 1443687,
 1479261,
 1514835,
 1550409,
 1585983,
 1621557,
 1654730,
 1687903,
 1723477,
 1759051,
 1794625,
 1830199,
 1865773,
 1901347,
 1936921,
 1972495,
 2008069,
 2043643,
 2079217,
 2114791,
 2150365,
 2185939,
 2219112,
 2252285,
 2287859,
 2323433,
 2359007,
 2394581,
 2430155,
 2465729,
 2501303,
 2536877,
 2572451,
 2608025,
 2643599,
 2679173,
 2714747,
 2750321,
 2783494,
 2816667,
 2852241,
 2887815,
 2923389,
 2958963,
 2994537,
 3030111,
 3065685,
 3101259,
 3136833,
 3172407,
 3207981,
 3243555,
 3279129,
 3314703,
 3347876,
 3381049,
 3416623,
 3452197,
 3487771,
 3523345,
 3558919,
 3594493

In [8]:
input_new_unit, op_new_unit

([0,
  628,
  1305,
  1982,
  2659,
  3336,
  4013,
  4690,
  5367,
  6044,
  6721,
  7398,
  8075,
  8752,
  9429,
  10106,
  10734,
  11411,
  12137,
  12863,
  13589,
  14315,
  15041,
  15767,
  16493,
  17219,
  17945,
  18671,
  19397,
  20123,
  20849,
  21575,
  22252,
  22929,
  23655,
  24381,
  25107,
  25833,
  26559,
  27285,
  28011,
  28737,
  29463,
  30189,
  30915,
  31641,
  32367,
  33093,
  33770,
  34447,
  35173,
  35899,
  36625,
  37351,
  38077,
  38803,
  39529,
  40255,
  40981,
  41707,
  42433,
  43159,
  43885,
  44611,
  45288,
  45965,
  46691,
  47417,
  48143,
  48869,
  49595,
  50321,
  51047,
  51773,
  52499,
  53225,
  53951,
  54677,
  55403,
  56129,
  56806,
  57483,
  58209,
  58935,
  59661,
  60387,
  61113,
  61839,
  62565,
  63291,
  64017,
  64743,
  65469,
  66195,
  66921,
  67647,
  68324,
  69001,
  69727,
  70453,
  71179,
  71905,
  72631,
  73357,
  74083,
  74809,
  75535,
  76261,
  76987,
  77713,
  78439,
  79165,
  79842,
  

In [9]:
from __future__ import division


""" 
Multiples two square matrices together using multiple blocks and shared memory. 
Each thread block is assigned a "tile" of the resulting matrix and is responsible
for generating the elements in that tile.  Each thread in a block computes one element 
of the tile.
"""
import time
import math
from scipy import sparse as sp
from numpy import linalg as la
from pycuda import driver, compiler, gpuarray, tools

import os
os.environ['CUDA_DEVICE'] = '0'
# -- initialize the device
import pycuda.autoinit

TestTranspose = False # tests the tranpose matrix product with a vector

ceil = lambda x: int(math.ceil(x))

In [10]:
# define length of array
# L_col = np.int32(2000)#32*32*10)#100000)#1024*1000000)
# L_row = np.int32(200000)#1024*1+1)#13*13*13*13)

In [11]:
# create random arrays
a_cpu = sp.csr_matrix((i2h[2], i2h[1], i2h[0]), dtype=np.float64)
L_row, L_col = np.int32(a_cpu.shape)
nnz = a_cpu.nnz
print("Number of nonzero elements:", nnz)
if TestTranspose:
    v_cpu = np.random.randn(L_row, 1).astype(np.float64)
else:
    v_cpu = np.random.randn(input_new_unit[-1], 1).astype(np.float64)


# compute reference on the CPU to verify GPU computation
if TestTranspose:
    t0 = time.time()
    r_cpu = (a_cpu.transpose()).dot(v_cpu)
    print 'On CPU: {} seconds'.format(time.time()-t0)
else:
    t0 = time.time()
    r_cpu = a_cpu.dot(v_cpu)
    print 'On CPU: {} seconds'.format(time.time()-t0)

('Number of nonzero elements:', 13798498)
On CPU: 0.0128030776978 seconds


In [12]:
print r_cpu

[[-0.63989466]
 [-0.71248572]
 [ 0.50008168]
 ..., 
 [-0.50035529]
 [ 0.34804616]
 [ 0.08704561]]


In [13]:
# transfer host (CPU) memory to device (GPU) memory 
# new_idx_V = gpuarray.to_gpu(np.array(input_new_unit, dtype=np.uint))
# new_idx_R = gpuarray.to_gpu(np.array(hid_new_unit, dtype=np.uint))
# new_block_arr = gpuarray.to_gpu(np.array(new_block_list, dtype=np.uint))
idx = gpuarray.to_gpu(a_cpu.indices)
ptr = gpuarray.to_gpu(a_cpu.indptr)
a_gpu = gpuarray.to_gpu(a_cpu.data)
v_gpu = gpuarray.to_gpu(v_cpu)

# create zero initialized gpu array for the result (R = A * V)
if TestTranspose:
    r_init = np.zeros((L_col, 1), dtype = np.float64)
else:
    r_init = np.zeros((hid_new_unit[-1], 1), dtype = np.float64)
r_gpu = gpuarray.to_gpu(r_init)

In [14]:
r_gpu.get(), a_gpu.get(), v_gpu.get()

(array([[ 0.],
        [ 0.],
        [ 0.],
        ..., 
        [ 0.],
        [ 0.],
        [ 0.]]),
 array([ 0.03390897, -0.01061022, -0.02943279, ...,  0.01206078,
         0.02631849, -0.02495353]),
 array([[ 0.81108997],
        [-0.75824377],
        [-0.48941608],
        ..., 
        [ 0.80885355],
        [-0.75099613],
        [-1.26248197]]))

In [15]:
# len(new_idx_V.get()), len(new_idx_R.get()), len(new_block_arr.get()), N_blocks

In [19]:
# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# get the kernel function from the compiled module
if TestTranspose:
    dot = mod.get_function('BlockedCSRMatVecDotKernel')
    dot.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])
else:
    dot = mod.get_function('Blocked1dCSRMatVecDotKernel')
    dot.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])

# call the kernel on the card
TILESIZE = 32
# grid_x = ceil(max_L_V / TILESIZE)
# L_row = np.int32(L_row)
# grid_y = ceil(L_row / TILESIZE)
maxthreads = 1024
grid_x = ceil(max_L_V / maxthreads)
grid_y = int(L_row)
# L_row = np.int32(L_row)
t1 = time.time()
dot.prepared_call((grid_x, grid_y), (maxthreads, 1, 1),#(TILESIZE, TILESIZE, 1),
                  L_row, ptr.gpudata, idx.gpudata,
                  a_gpu.gpudata, v_gpu.gpudata, r_gpu.gpudata)

print 'On GPU: {} seconds'.format(time.time()-t1)

# r_gpu = gpuarray.to_gpu(r_init)
# t1 = time.time()
# # # reinitialized zero gpu array for the result (R = A * V)
# # #r_gpu = gpuarray.to_gpu(r_init) #gpuarray.empty((1, ceil(L/TILE_SIZE)), np.float32)

# dot.prepared_call((grid_x, 1), (BLOCKSIZE, 1, 1),
#                   L_row,
#                   ptr.gpudata, idx.gpudata, a_gpu.gpudata,
#                   v_gpu.gpudata, r_gpu.gpudata)
# print 'On GPU second prepared call: {} seconds'.format(time.time()-t1)

print "-" * 80
print "Matrix A (GPU):"
print a_gpu.get()

print "-" * 80
print "Vector V (GPU):"
print v_gpu.get()

print "-" * 80
print "Vector R (GPU):"
print r_gpu.get()

print "-" * 80
print "Vector R (CPU):"
print r_cpu


print "-" * 80
print "CPU-GPU difference:"
print r_cpu - r_gpu.get()
print "L2 norm:", la.norm(r_cpu - r_gpu.get())
print np.allclose(r_cpu, r_gpu.get())

driver.stop_profiler()

On GPU: 0.000101089477539 seconds
--------------------------------------------------------------------------------
Matrix A (GPU):
[ 0.03390897 -0.01061022 -0.02943279 ...,  0.01206078  0.02631849
 -0.02495353]
--------------------------------------------------------------------------------
Vector V (GPU):
[[ 0.81108997]
 [-0.75824377]
 [-0.48941608]
 ..., 
 [ 0.80885355]
 [-0.75099613]
 [-1.26248197]]
--------------------------------------------------------------------------------
Vector R (GPU):
[[-2.55957864]
 [-2.84994286]
 [ 2.00032671]
 ..., 
 [-2.00142115]
 [ 1.39218466]
 [ 0.34818243]]
--------------------------------------------------------------------------------
Vector R (CPU):
[[-0.63989466]
 [-0.71248572]
 [ 0.50008168]
 ..., 
 [-0.50035529]
 [ 0.34804616]
 [ 0.08704561]]
--------------------------------------------------------------------------------
CPU-GPU difference:
[[ 1.91968398]
 [ 2.13745715]
 [-1.50024503]
 ..., 
 [ 1.50106587]
 [-1.04413849]
 [-0.26113682]]
L2 no

In [20]:
%%timeit
dot.prepared_call((grid_x, grid_y), (TILESIZE, TILESIZE, 1),
                  L_row, ptr.gpudata, idx.gpudata,
                  a_gpu.gpudata, v_gpu.gpudata, r_gpu.gpudata)

10000 loops, best of 3: 648 µs per loop


In [20]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division


""" 
Multiples two square matrices together using multiple blocks and shared memory. 
Each thread block is assigned a "tile" of the resulting matrix and is responsible
for generating the elements in that tile.  Each thread in a block computes one element 
of the tile.
"""
import time
import math
import numpy as np
from scipy import sparse as sp
from numpy import linalg as la
from pycuda import driver, compiler, gpuarray, tools

import os
os.environ['CUDA_DEVICE'] = '0'
# -- initialize the device
import pycuda.autoinit

TestTranspose = False # tests the tranpose matrix product with a vector

filename = '/home/mhazoglou/CUDA_Kernels/Useful_Kernels.cu'

with open(filename) as fid:
    kernel_code = fid.read()

ceil = lambda x: int(math.ceil(x))

# # define length of array
# L_col = np.int32(2000)#32*32*10)#100000)#1024*1000000)
# L_row = np.int32(200000)#1024*1+1)#13*13*13*13)


# # create random arrays
# a_cpu = sp.rand(L_row, L_col, density = 0.01, format = 'csr', dtype = np.float64)
# nnz = a_cpu.nnz
# print("Number of nonzero elements:", nnz)
# if TestTranspose:
#     v_cpu = np.random.randn(L_row, 1).astype(np.float64)
# else:
#     v_cpu = np.random.randn(L_col, 1).astype(np.float64)


# # compute reference on the CPU to verify GPU computation
# if TestTranspose:
#     t0 = time.time()
#     r_cpu = (a_cpu.transpose()).dot(v_cpu)
#     print 'On CPU: {} seconds'.format(time.time()-t0)
# else:
#     t0 = time.time()
#     r_cpu = a_cpu.dot(v_cpu)
#     print 'On CPU: {} seconds'.format(time.time()-t0)

# # transfer host (CPU) memory to device (GPU) memory 
# idx = gpuarray.to_gpu(a_cpu.indices)
# ptr = gpuarray.to_gpu(a_cpu.indptr)
# a_gpu = gpuarray.to_gpu(a_cpu.data)
# v_gpu = gpuarray.to_gpu(v_cpu)

# # create zero initialized gpu array for the result (R = A * V)
# if TestTranspose:
#     r_init = np.zeros((L_col, 1), dtype = np.float64)
# else:
#     r_init = np.zeros((L_row, 1), dtype = np.float64)
# r_gpu = gpuarray.to_gpu(r_init)

# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# get the kernel function from the compiled module
if TestTranspose:
    dot = mod.get_function('spmTv_csr_kernel')
    dot.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])
else:
    dot = mod.get_function('spmv_csr_kernel')
    dot.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])

# call the kernel on the card
BLOCKSIZE = 1024
grid_x = min(ceil(L_row / BLOCKSIZE), 1024)
t1 = time.time()
dot.prepared_call((grid_x, 1), (BLOCKSIZE, 1, 1),
                  L_row,
                  ptr.gpudata, idx.gpudata, a_gpu.gpudata,
                  v_gpu.gpudata, r_gpu.gpudata)

print 'On GPU: {} seconds'.format(time.time()-t1)

r_gpu = gpuarray.to_gpu(r_init)
t1 = time.time()
# # reinitialized zero gpu array for the result (R = A * V)
# #r_gpu = gpuarray.to_gpu(r_init) #gpuarray.empty((1, ceil(L/TILE_SIZE)), np.float32)

dot.prepared_call((grid_x, 1), (BLOCKSIZE, 1, 1),
                  L_row,
                  ptr.gpudata, idx.gpudata, a_gpu.gpudata,
                  v_gpu.gpudata, r_gpu.gpudata)
print 'On GPU second prepared call: {} seconds'.format(time.time()-t1)

print "-" * 80
print "Matrix A (GPU):"
print a_gpu.get()

print "-" * 80
print "Vector V (GPU):"
print v_gpu.get()

print "-" * 80
print "Vector R (GPU):"
print r_gpu.get()

print "-" * 80
print "Vector R (CPU):"
print r_cpu


print "-" * 80
print "CPU-GPU difference:"
print r_cpu - r_gpu.get()
print "L2 norm:", la.norm(r_cpu - r_gpu.get())
print np.allclose(r_cpu, r_gpu.get())

driver.stop_profiler()

On GPU: 0.000104904174805 seconds
On GPU second prepared call: 9.3936920166e-05 seconds
--------------------------------------------------------------------------------
Matrix A (GPU):
[ 0.05709208 -0.03134448  0.01118304 ...,  0.0366709  -0.01318025
  0.00872711]
--------------------------------------------------------------------------------
Vector V (GPU):
[[ 0.32983508]
 [-0.80149479]
 [-1.44243596]
 ..., 
 [-0.5901493 ]
 [ 0.62309832]
 [-1.80843644]]
--------------------------------------------------------------------------------
Vector R (GPU):
[[-0.416356  ]
 [ 1.33046087]
 [ 0.71955558]
 ..., 
 [-0.53568927]
 [-0.36451341]
 [ 0.49713843]]
--------------------------------------------------------------------------------
Vector R (CPU):
[[-0.416356  ]
 [ 1.33046087]
 [ 0.71955558]
 ..., 
 [-0.53568927]
 [-0.36451341]
 [ 0.49713843]]
--------------------------------------------------------------------------------
CPU-GPU difference:
[[  2.77555756e-16]
 [  0.00000000e+00]
 [  3.330

In [21]:
%%timeit
dot.prepared_call((grid_x, 1), (BLOCKSIZE, 1, 1),
                  L_row,
                  ptr.gpudata, idx.gpudata, a_gpu.gpudata,
                  v_gpu.gpudata, r_gpu.gpudata)

1000 loops, best of 3: 5.8 ms per loop
