<a href="https://colab.research.google.com/github/millacurafa/AdvancedDataScience/blob/main/Ejemplo_PyCUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/46/61/47d3235a4c13eec5a5f03594ddb268f4858734e02980afbcd806e6242fa5/pycuda-2020.1.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 11.1MB/s 
[?25hCollecting pytools>=2011.2
[?25l  Downloading https://files.pythonhosted.org/packages/b7/30/c9362a282ef89106768cba9d884f4b2e4f5dc6881d0c19b478d2a710b82b/pytools-2020.4.3.tar.gz (62kB)
[K     |████████████████████████████████| 71kB 10.2MB/s 
Collecting appdirs>=1.4.0
  Downloading https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl
Collecting mako
[?25l  Downloading https://files.pythonhosted.org/packages/a6/37/0e706200d22172eb8fa17d68a7ae22dec7631a0a92266634fb518a88a5b2/Mako-1.1.3-py2.py3-none-any.whl (75kB)
[K     |████████████████████████████████| 81kB 12.2MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (setup.py) .

In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy

# Build an 8x8 host matrix of random integers in the range 0..9 and cast it
# from numpy's default integer dtype to int32 so each element matches the
# `int` type the CUDA kernel operates on.
a = numpy.random.randint(low=0, high=10, size=(8, 8)).astype(numpy.int32)

# Allocate a device buffer of exactly a.nbytes bytes (the size of the host
# matrix) and copy the data host -> device.
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

# CUDA kernel: the function executed by every launched thread.
# Thread-indexing reference: https://cs.calvin.edu/courses/cs/374/CUDA/CUDA-Thread-Indexing-Cheatsheet.pdf

# For a GTX 1080
# Maximum number of threads per block:           1024
# Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
# Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)


mod = SourceModule("""
   __global__ void doublify(int *a, int n)
   {
     // Flatten the 2D grid of 2D blocks into a single linear element index.
     int blockId = blockIdx.x + blockIdx.y * gridDim.x;
     int idx = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;

     // The launch can create more threads than there are elements; surplus
     // threads must not write past the end of the buffer.
     if (idx < n)
     {
       a[idx] *= 2;
     }
   }
   """)

# Wrap the compiled kernel as a Python callable
func = mod.get_function("doublify")

# Events used to time the kernel launch
start = cuda.Event()
end   = cuda.Event()

start.record()
# Launch a 2x2 grid of 16x16x1 blocks (1024 threads). The matrix only has
# a.size elements, so the element count is passed to the kernel, which uses it
# to ignore the extra threads. Scalar kernel arguments must be sized numpy
# types, hence numpy.int32.
func(a_gpu, numpy.int32(a.size), block=(16,16,1),grid=(2,2,1))
end.record()
# Wait until the end event (and therefore the kernel) has completed
end.synchronize()

# Elapsed time between the two events
millis = start.time_till(end)
print("Tiempo: " + str(millis))
# Host buffer with the same shape and dtype as `a`, to receive the result
a_doubled = numpy.empty_like(a)

# Copy the data device -> host
cuda.memcpy_dtoh(a_doubled, a_gpu)

# Print the original matrix and the result, each followed by a blank line
for matrix in (a, a_doubled):
    print(matrix)
    print()


Tiempo: 0.088128000497818
[[9 9 5 6 5 0 6 7]
 [1 1 4 4 0 3 6 3]
 [0 9 1 5 1 6 6 8]
 [6 4 1 4 7 3 7 5]
 [8 2 0 7 3 6 9 7]
 [8 9 5 5 3 8 9 6]
 [3 2 3 7 2 9 1 9]
 [4 2 6 8 7 0 7 8]]

[[18 18 10 12 10  0 12 14]
 [ 2  2  8  8  0  6 12  6]
 [ 0 18  2 10  2 12 12 16]
 [12  8  2  8 14  6 14 10]
 [16  4  0 14  6 12 18 14]
 [16 18 10 10  6 16 18 12]
 [ 6  4  6 14  4 18  2 18]
 [ 8  4 12 16 14  0 14 16]]

