In [None]:
%%writefile Parallel.cu
#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
// Compile-time configuration.
// M         : the fields are stored as M*M floats, indexed as row*M + column.
// Nt        : upper bound of the time loop and length of the coefficient array C.
// dxy       : spatial step used for both grid directions.
// dt        : time step.
// GridSize  : number of blocks in the 1D launch grid.
// BlockSize : number of threads per block.
// ThreadSize: number of consecutive cells each thread updates.
#define M 10
#define Nt 10
#define dxy 0.1
#define dt 0.01
#define GridSize 10
#define BlockSize 10
// Fully parenthesized so the macro expands safely inside any larger expression
// (e.g. `x / ThreadSize` or `x - ThreadSize`).
#define ThreadSize ((M * M) / (GridSize * BlockSize))

// The work split uses integer division, so cells would be silently skipped
// if the thread count did not divide the cell count evenly.
#if ((M * M) % (GridSize * BlockSize)) != 0
#error "M*M must be a multiple of GridSize*BlockSize"
#endif
//==============================
// Returns sin(pi * (i*dxy + j*dxy)) for the integer grid indices (i, j).
// The expression is evaluated in double precision (dxy and M_PI are doubles)
// and narrowed to float on return.
float funct1(int i, int j) {
  const double x = i * dxy;
  const double y = j * dxy;
  const double phase = M_PI * (x + y);
  return sin(phase);
}
// Returns cos(pi * (i*dxy + j*dxy)) for the integer grid indices (i, j).
// The expression is evaluated in double precision (dxy and M_PI are doubles)
// and narrowed to float on return.
float funct2(int i, int j) {
  const double x = i * dxy;
  const double y = j * dxy;
  const double phase = M_PI * (x + y);
  return cos(phase);
}
//==============================
// Fills the host-side input arrays.
//   U : M*M floats, U[j*M + i] = funct1(i, j)
//   V : M*M floats, V[j*M + i] = dt * funct2(i, j) + U[j*M + i]
//   C : Nt floats, every entry set to 0.5
// All three buffers must already be allocated by the caller.
void InputData(float *U, float *V, float *C) {
  for (int j = 0; j < M; j++) {
    for (int i = 0; i < M; i++) {
      const int idx = j * M + i;
      U[idx] = funct1(i, j);
      V[idx] = dt * funct2(i, j) + U[idx];
    }
  }
  for (int t = 0; t < Nt; t++) {
    // 0.5f, not 1/2: integer division would store 0 and zero out the
    // coefficient everywhere it is used.
    C[t] = 0.5f;
  }
}
//===========================
// One explicit time step of the five-point stencil update
//   A[l] = 2*B[l] - A[l] + (dt^2 * C[t]^2 / dxy^2) * (sum of 4 neighbours of B - 4*B[l])
// over an M x M field stored as l = row*M + column.
//
//   A : M*M floats, read and then overwritten in place (each cell is touched
//       only by the thread that owns it)
//   B : M*M floats, read only
//   C : coefficient array, only C[t] is read
//   t : index into C
//
// Launch layout: 1D grid, 1D blocks. Each thread handles ThreadSize
// consecutive cells starting at globalThreadId * ThreadSize, so the launch
// must supply at least M*M / ThreadSize threads. No shared memory is used.
// Neighbours that fall outside the M x M field contribute 0.
__global__ void Computing(float *A, float *B, float *C, int t){
  const int total = M * M;
  const int perThread = (ThreadSize);
  const int index = blockIdx.x * blockDim.x + threadIdx.x;
  const int start = index * perThread;
  int stop = start + perThread;
  // Clamp so surplus threads (or a short final chunk) never index past the field.
  if (stop > total) stop = total;

  // Loop-invariant stencil coefficient. Evaluated in double because dt and
  // dxy are double literals, matching the precision of the update below.
  const float speed = C[t];
  const double coeff = (dt * dt * speed * speed) / (dxy * dxy);

  for (int l = start; l < stop; l++) {
    const float left   = (l % M == 0)       ? 0.0f : B[l - 1];
    const float right  = (l % M == M - 1)   ? 0.0f : B[l + 1];
    // The l - M neighbour does not exist for the first row (l < M) ...
    const float bottom = (l < M)            ? 0.0f : B[l - M];
    // ... and the l + M neighbour does not exist for the last row.
    const float top    = (l >= M * (M - 1)) ? 0.0f : B[l + M];
    const float center1 = B[l];
    const float center2 = A[l];
    A[l] = 2*center1 - center2 + coeff * (left + right + bottom + top - 4*center1);
  }
}
//===========================
// Prints a diagnostic and terminates the program when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what, int line) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error at line %d (%s): %s\n",
            line, what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}
#define CUDA_CHECK(call) checkCuda((call), #call, __LINE__)

// Host driver: builds the inputs, runs the time loop on the GPU and prints
// the most recently updated M x M field, one row per line.
// Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
int main() {
  const size_t fieldBytes = (size_t)M * M * sizeof(float);
  const size_t coeffBytes = (size_t)Nt * sizeof(float);

  // Allocate and fill the host buffers
  float *UCPU = (float *) malloc(fieldBytes);
  float *VCPU = (float *) malloc(fieldBytes);
  float *CCPU = (float *) malloc(coeffBytes);
  if (UCPU == NULL || VCPU == NULL || CCPU == NULL) {
    fprintf(stderr, "Host allocation failed\n");
    free(UCPU); free(VCPU); free(CCPU);
    return EXIT_FAILURE;
  }
  InputData(UCPU, VCPU, CCPU);

  // Declare and allocate memory on the GPU
  float *UGPU = NULL, *VGPU = NULL, *CGPU = NULL;
  CUDA_CHECK(cudaMalloc((void**)&UGPU, fieldBytes));
  CUDA_CHECK(cudaMalloc((void**)&VGPU, fieldBytes));
  CUDA_CHECK(cudaMalloc((void**)&CGPU, coeffBytes));

  // Copy input from CPU to GPU
  CUDA_CHECK(cudaMemcpy(UGPU, UCPU, fieldBytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(VGPU, VCPU, fieldBytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(CGPU, CCPU, coeffBytes, cudaMemcpyHostToDevice));

  // Define block and thread structure
  dim3 dimGrid(GridSize);
  dim3 dimBlock(BlockSize);

  // Time loop. The kernel overwrites its first field argument using the
  // second, so the two buffers swap roles every step: even t writes U,
  // odd t writes V. Launches on the default stream run in issue order.
  for (int t = 2; t < Nt; t++) {
    if (t % 2 == 0) Computing<<<dimGrid, dimBlock>>>(UGPU, VGPU, CGPU, t);
    else            Computing<<<dimGrid, dimBlock>>>(VGPU, UGPU, CGPU, t);
    // Kernel launches do not return a status; fetch launch errors explicitly.
    CUDA_CHECK(cudaGetLastError());
  }
  // Surface any error raised while the kernels were executing.
  CUDA_CHECK(cudaDeviceSynchronize());

  // Copy output from GPU to CPU. C is never modified on the device, so it
  // is not copied back.
  CUDA_CHECK(cudaMemcpy(UCPU, UGPU, fieldBytes, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(VCPU, VGPU, fieldBytes, cudaMemcpyDeviceToHost));

  // Show results: the last step is t = Nt-1, which wrote U when Nt is odd
  // and V when Nt is even.
  const float *result = (Nt % 2 == 1) ? UCPU : VCPU;
  for (int j = 0; j < M; j++) {
    for (int i = 0; i < M; i++) {
      printf("%f ", result[j * M + i]);
    }
    printf("\n");
  }

  // Free memory on CPU and GPU
  free(UCPU); free(VCPU); free(CCPU);
  CUDA_CHECK(cudaFree(UGPU));
  CUDA_CHECK(cudaFree(VGPU));
  CUDA_CHECK(cudaFree(CGPU));
  return 0;
}

In [None]:
!nvcc Parallel.cu -lm

In [None]:
!./a.out