# In [None]:
import numpy as np
import matplotlib.pyplot as plt

import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit



# CUDA source for one explicit diffusion step on a 2d grid with periodic
# boundaries. The array is addressed row-major as arr[i*nc + j]; the row
# index i is taken from the x thread dimension and the column index j from
# the y thread dimension.
cuda_code = SourceModule("""
    // Wrap x into [0, L) for periodic boundaries; requires L > 0.
    // C's % truncates toward zero, so the second remainder is what maps a
    // negative x (e.g. -1) onto the far edge (L-1).
    __device__ inline int pbc(int x, int L)
    {
        return ((x % L) + L) % L;
    }

    // arr_out = arr_in + D * (sum of the 4 neighbours - 4 * centre)
    // for every cell of the nr x nc array.
    __global__ void diffusion(const double *arr_in,
                              double *arr_out,
                              double D,
                              int nr,
                              int nc)
    {
        int i = threadIdx.x + blockIdx.x * blockDim.x;   // row
        int j = threadIdx.y + blockIdx.y * blockDim.y;   // column

        // Threads outside the array must do nothing, otherwise a grid
        // that overshoots nr x nc reads and writes out of bounds.
        if (i >= nr || j >= nc)
            return;

        int n = pbc(i - 1, nr);
        int s = pbc(i + 1, nr);
        int w = pbc(j - 1, nc);
        int e = pbc(j + 1, nc);

        int c = i * nc + j;

        arr_out[c] = arr_in[c] +
                     D * (arr_in[n * nc + j] + arr_in[s * nc + j] +
                          arr_in[i * nc + w] + arr_in[i * nc + e] -
                          4.0 * arr_in[c]);
    }
""")



def plot_heat_map(matrix):
  """
      Show the array as a heat map with a colour bar.

      The colour scale is fixed to the range [0, 5] and the cells are
      drawn square.

      Input:   matrix = Input 2d matrix
  """
  # Work on an explicit axes object: calling plt.axes() with no arguments
  # adds a *new* empty axes on current Matplotlib, so it cannot be used to
  # reach the axes the image was drawn on.
  fig, ax = plt.subplots()
  hm = ax.imshow(matrix, interpolation='none', aspect='equal',
                 vmin=0.0, vmax=5.0)
  fig.colorbar(hm, ax=ax)
  plt.show()



def initilise_array(arr, h, w, r):
  """
      Mark a filled disc in the array, in place.

      Every cell (i, j) with 0 <= i < h and 0 <= j < w whose distance from
      the centre cell (h//2, w//2) is strictly smaller than r is set to 5.
      All other cells are left untouched. Nothing is returned.

      Input:   arr = 2d numpy array with at least h rows and w columns
               h   = number of rows to consider
               w   = number of columns to consider
               r   = radius of the disc, in cells
  """
  # Offsets from the centre as a column vector and a row vector; broadcasting
  # them gives the full h x w distance map without a Python-level loop.
  dx = (np.arange(h, dtype=np.float64) - h // 2)[:, np.newaxis]
  dy = (np.arange(w, dtype=np.float64) - w // 2)[np.newaxis, :]
  inside = np.sqrt(dx**2 + dy**2) < r

  # arr[:h, :w] is a view, so the masked assignment writes into arr itself.
  arr[:h, :w][inside] = 5


def _largest_divisor(n, limit):
  """
      Return the largest d with 1 <= d <= limit that divides n exactly.
  """
  for d in range(min(n, limit), 0, -1):
    if n % d == 0:
      return d
  return 1


def diffusion(w, h, D, maxtime):
  """
      Run the diffusion kernel on the GPU and plot the start and end states.

      An h x w array of zeros with a disc of radius 60 set to 5 around its
      centre is plotted, advanced maxtime steps by the "diffusion" kernel
      from cuda_code, and plotted again.

      Input:   w       = number of columns of the grid (> 0)
               h       = number of rows of the grid (> 0)
               D       = coefficient passed to the kernel for every step
               maxtime = number of steps to run

      Output:  the final h x w numpy array (float64)
  """
  w = int(w)
  h = int(h)
  if w <= 0 or h <= 0:
    raise ValueError("w and h must be positive, got w=%d, h=%d" % (w, h))

  # Block sizes are chosen to divide the domain exactly, so the launched
  # threads cover every cell once and none falls outside the array; this
  # holds whether or not the kernel guards its own indices.
  # 32 x 32 = 1024 threads per block at most.
  bh = _largest_divisor(h, 32)
  bw = _largest_divisor(w, 32)
  gh = h // bh
  gw = w // bw

  # PyCUDA needs numpy scalars to pass arguments by value to the kernel
  nr = np.int32(h)
  nc = np.int32(w)
  D = np.float64(D)

  laplacian = cuda_code.get_function("diffusion")

  # Data on the host + initialisation
  arr = np.zeros((h, w), dtype=np.float64)
  initilise_array(arr, h, w, 60)
  plot_heat_map(arr)

  # Two device buffers: the kernel reads one and writes the other
  src_gpu = cuda.mem_alloc(arr.nbytes)
  dst_gpu = cuda.mem_alloc(arr.nbytes)
  try:
    # Upload once; the data then stays on the GPU for the whole run
    cuda.memcpy_htod(src_gpu, arr)

    for _ in range(maxtime):
      # The kernel takes the row index from the x thread dimension and the
      # column index from y, so rows (h) go first in both block and grid.
      laplacian(src_gpu, dst_gpu, D, nr, nc,
                block=(bh, bw, 1), grid=(gh, gw))
      # The freshly written buffer is the input of the next step
      src_gpu, dst_gpu = dst_gpu, src_gpu

    # Download the latest state (the untouched input when maxtime is 0)
    cuda.memcpy_dtoh(arr, src_gpu)
  finally:
    src_gpu.free()
    dst_gpu.free()

  plot_heat_map(arr)
  return arr


# Simulation parameters: grid size, kernel coefficient and number of steps
w, h = 512, 512
D = np.float64(0.1)
maxtime = 10000

# The trailing semicolon keeps a notebook from echoing the call's result
diffusion(w, h, D, maxtime);