<a href="https://colab.research.google.com/github/pabloperfer/C/blob/master/MatrixTranspositionwithSharedMemory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from numba.cuda.stubs import syncthreads
import numpy as np
from numba import cuda
import numba.types

TILE_SIZE=32

@cuda.jit
def transpose(a_in, a_out):
  #Explicitly calculate indices rather than using cuda.grid(2)
  tile = cuda.shared.array((TILE_SIZE, TILE_SIZE), numba.types.int32)
  row = cuda.blockIdx.x * TILE_SIZE + cuda.threadIdx.x
  col = cuda.blockIdx.y * TILE_SIZE + cuda.threadIdx.y
  
  
  for z in range (0,TILE_SIZE,TILE_SIZE):
    tile[cuda.threadIdx.y + z,cuda.threadIdx.x] = a_in[col + z, row ]
                                         
  cuda.syncthreads()
 
  for i in range (0,TILE_SIZE,TILE_SIZE):
    a_out[row,col + i ] = tile[cuda.threadIdx.x][cuda.threadIdx.y + i]
    
  


In [2]:
size = 16384
a_in = cuda.to_device(np.arange(size*size,dtype=np.int32).reshape((size,size)))
a_out = cuda.device_array_like(a_in)

print(a_in.copy_to_host())
print(a_out.copy_to_host())

[[        0         1         2 ...     16381     16382     16383]
 [    16384     16385     16386 ...     32765     32766     32767]
 [    32768     32769     32770 ...     49149     49150     49151]
 ...
 [268386304 268386305 268386306 ... 268402685 268402686 268402687]
 [268402688 268402689 268402690 ... 268419069 268419070 268419071]
 [268419072 268419073 268419074 ... 268435453 268435454 268435455]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [3]:
threads_per_block=(32,32)
blocks_per_grid=(int(size/threads_per_block[0]),int(size/threads_per_block[1]))

In [4]:
timeit transpose[blocks_per_grid, threads_per_block](a_in,a_out); cuda.synchronize();print(a_out.copy_to_host())

[[        0         1         2 ... 267911197 267911198 267911199]
 [    16384     16385     16386 ... 267927581 267927582 267927583]
 [    32768     32769     32770 ... 267943965 267943966 267943967]
 ...
 [   491488    491489    491490 ... 268402685 268402686 268402687]
 [   507872    507873    507874 ... 268419069 268419070 268419071]
 [   524256    524257    524258 ... 268435453 268435454 268435455]]
[[        0         1         2 ... 267911197 267911198 267911199]
 [    16384     16385     16386 ... 267927581 267927582 267927583]
 [    32768     32769     32770 ... 267943965 267943966 267943967]
 ...
 [   491488    491489    491490 ... 268402685 268402686 268402687]
 [   507872    507873    507874 ... 268419069 268419070 268419071]
 [   524256    524257    524258 ... 268435453 268435454 268435455]]
[[        0         1         2 ... 267911197 267911198 267911199]
 [    16384     16385     16386 ... 267927581 267927582 267927583]
 [    32768     32769     32770 ... 267943965 2679