# Participants:
Gracia Estrán Buyo: 100452014

Marta Almagro Fuello: 1004591979



In [1]:
#Uncomment next line if you are using Google Colaboratory
#!pip install pycuda

In [2]:
import  numpy  as  np
import  pycuda.autoinit
from    pycuda.compiler import SourceModule
import  pycuda.driver as  drv
import  pycuda.gpuarray as  gpuarray
import  pycuda.tools as gtools
from numpy import linalg as la
from IPython import display
import time

# Guide to use Tiled Algorithms
Whe we wants to use a tiled memory based algorithms, we need to analize the follow steps:
1) Tiled Memory Size: What information we will share across all the execution threads in a execution block
2) Assign the memory position to each execution thread with memory coalesence
3) Fill the Tiled Memory in parallel
4) Assign to some threads the extra data needed for the algorithms
5) Synchronize the filling memory execution
6) Each thread execute his individual task
7) Synchronize the execution task
8) End the execution

# TILED REDUCTION ARRAY
The algorithm of reduction (which calculates the sum of all elements in an array), works as follow:

![image.png](attachment:d6e8dd16-0624-4d98-af7c-5a8f36bd28c5.png)

* Tiled memory size: The tile will contains the twice times the number of threads assigned
* Each thread in the execution block copy the data from global memory to the assigned memory place
* No extra data need (for this task)
* In each iteration
    * Adds the possition asigned and the next available data (indexed by the stride)
    * This will works until the stride exceeds the block size

In [3]:
tiled_reduction_src = """
__global__ void tiled_reduction( float *v, float *c, int N){

  const int BLOCK_SIZE=1024;

  __shared__ float partialSum[ 2 * BLOCK_SIZE ]; //The array dimensions MUST be constants

  unsigned int t = threadIdx.x;
  unsigned int start = 2 * blockIdx.x * blockDim.x;

  
  //fill the tile memory
  //each thread will fill the memory position start +t and start+blockDim.x+t
  //look, each consecutive execution thread (threadIdx.x) will access to coalesced memory in both steps
  
  if ( (start+t) < N) 
    partialSum[t]=v[start+t];
  else 
    partialSum[t]=0;

  if ((start+blockDim.x+t) < N)
    partialSum[blockDim.x+t]=v[start+blockDim.x+t];
  else
    partialSum[blockDim.x+t]=0;

  //Here we will wait until all execution threads fills te memory
  __syncthreads();

  for ( unsigned int stride=1; stride <=blockDim.x; stride*=2 ) {
    __syncthreads();
    if ( t % stride == 0)
      partialSum[2*t]+=partialSum[2*t+stride];
  }
  c[blockIdx.x]=partialSum[0];
}
"""

Here we can not edit automatically the source template to use the string % function to replace character chains in the string variable by other values. 

The problem to use modern strings formating (format method or f-strings) is the collision of the use of {} symbols, and in the previos source code, the presence of modulus operator (%) collides with the string subtitution.

In further codes we will use it to be able to substitute constants values from external variables.

In [4]:
#Is the BLOCK_SIZE A GOOD BLOCK SIZE????
BLOCK_SIZE=1024

In [5]:
print(tiled_reduction_src)


__global__ void tiled_reduction( float *v, float *c, int N){

  const int BLOCK_SIZE=1024;

  __shared__ float partialSum[ 2 * BLOCK_SIZE ]; //The array dimensions MUST be constants

  unsigned int t = threadIdx.x;
  unsigned int start = 2 * blockIdx.x * blockDim.x;

  
  //fill the tile memory
  //each thread will fill the memory position start +t and start+blockDim.x+t
  //look, each consecutive execution thread (threadIdx.x) will access to coalesced memory in both steps
  
  if ( (start+t) < N) 
    partialSum[t]=v[start+t];
  else 
    partialSum[t]=0;

  if ((start+blockDim.x+t) < N)
    partialSum[blockDim.x+t]=v[start+blockDim.x+t];
  else
    partialSum[blockDim.x+t]=0;

  //Here we will wait until all execution threads fills te memory
  __syncthreads();

  for ( unsigned int stride=1; stride <=blockDim.x; stride*=2 ) {
    __syncthreads();
    if ( t % stride == 0)
      partialSum[2*t]+=partialSum[2*t+stride];
  }
  c[blockIdx.x]=partialSum[0];
}



In [6]:
mod = SourceModule(tiled_reduction_src)

In [7]:
datasize=np.int32(1000000)

In [8]:
tiled_reduction = mod.get_function("tiled_reduction")

In [9]:
data = np.random.randn(datasize).astype(np.float32)
data_gpu=gpuarray.to_gpu(data)

In [10]:
block_size=(int(BLOCK_SIZE),1,1)

In [11]:
numblocks = int(np.ceil(datasize/BLOCK_SIZE))

In [12]:
c_gpu=gpuarray.empty((numblocks,1),np.float32)

In [13]:
grid_size=(numblocks,1)

In [14]:
start_t = time.time()
tiled_reduction(data_gpu,
                c_gpu,
                datasize,
                grid=grid_size,
                block=block_size)
end_t=time.time

In [15]:
c=c_gpu.get()

In [16]:
sum(data)

1926.5502085274682

In [17]:
np.sum(c)

1926.55

# 1D Convolution

The algorithm to implement will calculate the convolution between 2 arrays.

The shortest array, called system mask, system response, represents the exit of a system to an special signal called Dirac's delta (signal of infinite height, but limited area under the curve).

The second array (the longest one) is the signal to be shaped by our system.

Based on this mathematical operation the filters works.

The Image Filter algorithms are 2 dimensional convolutions.<br>

![image.png](attachment:17f3b373-5760-4edb-b6b3-a233aef56fe3.png)

The problem with the tile algorithms are we need extra data to calculate the the correct convolution (halos)<br>

![image.png](attachment:e3bc106b-14eb-4935-8de3-549c7a225578.png)

Then, our steps to implement the algorithm will be:

* Tiled memory size: The tile will contains not only the block size elements, but also the system mask length - 1, to store the halos. Also, we need to store in memory the shared memory the system mask. 
* Each thread in the execution block copy the data from global memory to the assigned memory place, and few of them will fill the halos.<br>

![image.png](attachment:3a76556c-c652-49c4-821a-c5be890b13aa.png)

<br/>

![image.png](attachment:83d342d6-57dc-439a-8dc7-190a5dc8a10b.png)
<br/>

![image.png](attachment:5d4f3aa2-d27b-4639-94fc-e9b3ae706820.png)

<br/>
* Once filled the assigned memory positions, we have to wait for the other tasks (\_\_syncthreads())
* Now, we have to calculate the convolution between the system mask and the assigned memory position in the signal vector.

You have to implement the algorithms in the follow cell.



In [18]:
convolution_src = """
__global__ void convolution( float *v, 
                             float *c, 
                             float *conv,
                             int N,
                             int c_size){

  __shared__ float tile[1024]; // We saw that with 512 could diverge and with 1024  converges

  __shared__ float mask[5]; // five is the length of the mask


  // We start by defining the variables
  int t = threadIdx.x;
  int start = blockIdx.x * blockDim.x;
  int halo = 2;

  // Fill the tile memory
  if ((start+t) < N) {
    tile[2+t] = v[start+t];
  } else {
    tile[2+t] = 0;
  }

    // To load the left halo 
  if (t < halo) {
    if ((start-halo+t) < 0) {
      tile[t] = 0; //ghost cell
    } else {
      tile[t] = v[start-halo+t];
    }
  }

   // To load the right halo
  if ((t+halo) >= 512) {
    if ((start+halo+t) >= N) {
      tile[t+halo] = 0; // ghost cell
    } else {
      tile[t+halo] = v[start+halo+t];
    }
  }
  __syncthreads();

  //fill the mask
  if (t < 5) {
    mask[t] = conv[t];
  }
  __syncthreads();

  // multiplicate and sum it (convolution definition)
  float accu=0;
  for (int i=0; i<5; i++) {
    accu += mask[i]*tile[t+i];
  }
  c[start+t]=accu;

}
"""

In [19]:
filtermask=np.array([1,1,3,1,1],dtype=np.float32)
filtermask_gpu=gpuarray.to_gpu(filtermask)
filtermask_size=np.int32(5)

In [20]:
convolved_gpu=gpuarray.empty((datasize,1),np.float32)

In [21]:
BLOCK_SIZE = 1024 

In [22]:
mod2 = SourceModule(convolution_src)

In [23]:
convolution = mod2.get_function("convolution")

In [24]:
block_size=(BLOCK_SIZE,1,1)

In [25]:
numblocks = int(np.ceil(datasize/BLOCK_SIZE))
grid_size=(numblocks,1)

In [26]:
start_t = time.time()
convolution(data_gpu,
            convolved_gpu,
            filtermask_gpu,
            datasize,
            filtermask_size,
            grid=grid_size,
            block=block_size)
end_t=time.time

In [27]:
local_convolved = np.convolve(data,filtermask, mode = 'full')
local_convolved[2:-2]

array([-4.3711133 , -2.5286596 , -5.394397  , ..., -2.579719  ,
       -0.12395483,  1.7146522 ], dtype=float32)

In [28]:
convolved = convolved_gpu.get()
convolved

array([[-4.3711133],
       [-2.5286593],
       [-5.394397 ],
       ...,
       [ 1.7146521],
       [ 1.1187958],
       [ 0.7496468]], dtype=float32)

We can observe that the results that we got with the numpy function and the with the tile algorithm is the same. We start noticing the difference in the seventh decimal in most of the cases. 