In [1]:
import pycuda
import pycuda.driver as cuda
import pycuda.autoinit

from pycuda.compiler import SourceModule

import numpy as np
import time

# Matrix transpose example for an N x M matrix.
# (Written as comments rather than a trailing string literal: a bare string
# as the last expression of a cell is echoed back as the cell's Out[] value.)
print(pycuda.VERSION)

(2019, 1, 1)


'\nMatrix transpose example\nNxM mat\n'

In [2]:
# Shape of the input matrix as (rows, cols); the transposed result simply
# has the two axes swapped.
MAT_SIZE = (1024, 64)
RES_SIZE = MAT_SIZE[::-1]

In [3]:
# Build the host matrix and mirror it in GPU memory.
# Deterministic input 0, 1, 2, ... stored row-major with shape MAT_SIZE
# (every value is exactly representable in float32 at this size).
a = np.arange(MAT_SIZE[0] * MAT_SIZE[1], dtype=np.float32).reshape(MAT_SIZE)
# Allocate device memory for the input matrix
a_gpu = cuda.mem_alloc(a.nbytes)
# Host and device buffers for the transposed result
result = np.zeros(shape=RES_SIZE, dtype=np.float32)
result_gpu = cuda.mem_alloc(result.nbytes)
# Copy the host matrix to the device
cuda.memcpy_htod(a_gpu, a)
# Matrix extents as 32-bit ints, derived from MAT_SIZE instead of being
# hard-coded (the np.int alias was removed in NumPy 1.24, and the old height
# of 1024 did not match the 64-column matrix).
# NOTE: the transpose kernel in this notebook takes only the two matrix
# pointers and never reads these buffers; they are kept because a later cell
# frees `width` and `height`.
w = np.array([MAT_SIZE[0]], dtype=np.int32)
h = np.array([MAT_SIZE[1]], dtype=np.int32)
width = cuda.mem_alloc(w.nbytes)
height = cuda.mem_alloc(h.nbytes)

cuda.memcpy_htod(width, w)
cuda.memcpy_htod(height, h)

In [4]:
# Transpose kernel for a row-major float matrix; it is not restricted to
# square inputs (it is run here on the MAT_SIZE = (1024, 64) matrix).
# The kernel takes no size arguments: it derives both extents from the launch
# configuration (gridDim * blockDim) and performs no bounds check, so the
# grid must cover the matrix exactly, with x spanning the rows of `a` and
# y spanning its columns.
mod = SourceModule("""
    __global__ void transpose(float *a, float *res)
    {
        int width = gridDim.x * blockDim.x;
        int height = gridDim.y * blockDim.y;
        // original row and column indices
        // think row, column instead x,y
        int rowIdx = blockDim.x * blockIdx.x + threadIdx.x;
        int colIdx = blockDim.y * blockIdx.y + threadIdx.y;
        
        int tIdx = width * colIdx + rowIdx;
        int oIdx = height * rowIdx + colIdx;
        res[tIdx] = a[oIdx];
    }
""")

In [5]:
# Let's compare operating time
# GPU
# The kernel derives the matrix extents from the launch configuration and has
# no bounds check, so the grid is computed from MAT_SIZE and must tile it
# exactly.
block_dim = (16, 16, 1)
assert MAT_SIZE[0] % block_dim[0] == 0 and MAT_SIZE[1] % block_dim[1] == 0, \
    "MAT_SIZE must be a multiple of the block size in both dimensions"
grid_dim = (MAT_SIZE[0] // block_dim[0], MAT_SIZE[1] // block_dim[1])
# Look the kernel up before starting the clock so only the launch is timed
func = mod.get_function("transpose")

startTime = time.perf_counter()
func(a_gpu, result_gpu, block=block_dim, grid=grid_dim)
# Kernel launches return immediately; wait for the GPU to finish before
# reading the clock, otherwise only the launch overhead is measured.
cuda.Context.synchronize()
consumedTime = time.perf_counter() - startTime
print("Time for gpu operation : ", consumedTime)


Time for gpu operation :  0.0016679763793945312


In [6]:
# CPU
# np.transpose on its own only returns a strided view (no element is moved),
# which would make this timing meaningless next to the GPU kernel. Force a
# contiguous copy so the data is actually rearranged in memory.
startTime = time.perf_counter()
result_cpu = np.ascontiguousarray(np.transpose(a))
consumedTime = time.perf_counter() - startTime
print("Time for cpu operation : ", consumedTime)

Time for cpu operation :  0.00026702880859375


In [7]:
# Bring both device buffers back to host arrays: the transposed result and
# a copy of the original input.
aaa = np.zeros(MAT_SIZE, dtype=np.float32)
cuda.memcpy_dtoh(result, result_gpu)
cuda.memcpy_dtoh(aaa, a_gpu)

# Release every device allocation made by the setup cell
for device_buffer in (a_gpu, result_gpu, width, height):
    device_buffer.free()

In [10]:
# Check the GPU output element-by-element against the NumPy reference
print("Is it same? : ", np.array_equal(result, result_cpu))

Is it same? :  True


In [11]:
# Display the CPU (NumPy) reference transpose for visual inspection
result_cpu

array([[0.0000e+00, 6.4000e+01, 1.2800e+02, ..., 6.5344e+04, 6.5408e+04,
        6.5472e+04],
       [1.0000e+00, 6.5000e+01, 1.2900e+02, ..., 6.5345e+04, 6.5409e+04,
        6.5473e+04],
       [2.0000e+00, 6.6000e+01, 1.3000e+02, ..., 6.5346e+04, 6.5410e+04,
        6.5474e+04],
       ...,
       [6.1000e+01, 1.2500e+02, 1.8900e+02, ..., 6.5405e+04, 6.5469e+04,
        6.5533e+04],
       [6.2000e+01, 1.2600e+02, 1.9000e+02, ..., 6.5406e+04, 6.5470e+04,
        6.5534e+04],
       [6.3000e+01, 1.2700e+02, 1.9100e+02, ..., 6.5407e+04, 6.5471e+04,
        6.5535e+04]], dtype=float32)

In [12]:
# Display the transpose copied back from the GPU for visual inspection
result

array([[0.0000e+00, 6.4000e+01, 1.2800e+02, ..., 6.5344e+04, 6.5408e+04,
        6.5472e+04],
       [1.0000e+00, 6.5000e+01, 1.2900e+02, ..., 6.5345e+04, 6.5409e+04,
        6.5473e+04],
       [2.0000e+00, 6.6000e+01, 1.3000e+02, ..., 6.5346e+04, 6.5410e+04,
        6.5474e+04],
       ...,
       [6.1000e+01, 1.2500e+02, 1.8900e+02, ..., 6.5405e+04, 6.5469e+04,
        6.5533e+04],
       [6.2000e+01, 1.2600e+02, 1.9000e+02, ..., 6.5406e+04, 6.5470e+04,
        6.5534e+04],
       [6.3000e+01, 1.2700e+02, 1.9100e+02, ..., 6.5407e+04, 6.5471e+04,
        6.5535e+04]], dtype=float32)