<a href="https://colab.research.google.com/github/m-pandey5/CUDA/blob/main/Unified_memory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Unified memory uses `cudaMallocManaged()` to allocate memory that is accessible from both the host and the device, so we don't have to call `cudaMemcpy` to move data between them separately. The one thing to remember with unified memory is that you still need `cudaDeviceSynchronize()` after kernel launches, to ensure all GPU operations are complete before accessing the data on the CPU side.

In [16]:
%%writefile unified.cu
#include <stdio.h>
#include <cassert>
#include <cstdlib>
#include <iostream>
using std::cout;
// CUDA kernel for element-wise vector addition: c[i] = a[i] + b[i] for every
// i in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and then advances by the total number of launched threads, so the
// result is complete for any grid/block size (even a single small block).
// Shared memory: none.
// Preconditions: a, b and c are device-accessible and each holds at least N
// ints. Nothing is read or written when N <= 0.
__global__ void vectorAdd(const int *a, const int *b, int *c, int N) {
  // 64-bit arithmetic so neither the starting index nor the stride can wrap
  // around when the launched thread count approaches INT_MAX.
  const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
  long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

  // The loop condition doubles as the boundary check for the grid tail.
  for (; i < N; i += stride) {
    c[i] = a[i] + b[i];
  }
}
// Terminates the program with a readable message when a CUDA runtime call
// (or a kernel launch/execution reported through one) did not succeed.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what,
            cudaGetErrorString(status));
    std::exit(EXIT_FAILURE);
  }
}

// Demo driver: allocates three managed (unified-memory) vectors, fills two of
// them with random values on the host, adds them on the GPU, prints a sample
// of the result and verifies every element on the CPU.
// Returns 0 on success and EXIT_FAILURE when the GPU result is wrong; any
// CUDA runtime failure terminates the program via checkCuda().
int main() {
  const int N = 1 << 16;
  const size_t bytes = N * sizeof(int);

  // Unified memory pointers: the same address is valid on host and device.
  int *a = nullptr, *b = nullptr, *c = nullptr;
  checkCuda(cudaMallocManaged(&a, bytes), "cudaMallocManaged(a)");
  checkCuda(cudaMallocManaged(&b, bytes), "cudaMallocManaged(b)");
  checkCuda(cudaMallocManaged(&c, bytes), "cudaMallocManaged(c)");

  // Initialise the input vectors directly on the host (no memcpy required).
  for (int i = 0; i < N; i++) {
    a[i] = rand() % 100;
    b[i] = rand() % 100;
  }

  // 1024 threads per block
  const int block_size = 1 << 10;
  // Number of blocks per grid, rounded up so the tail is covered.
  const int grid_size = (N + block_size - 1) / block_size;

  vectorAdd<<<grid_size, block_size>>>(a, b, c, N);
  // A launch never returns an error itself: a bad configuration is reported
  // by cudaGetLastError(), a fault inside the kernel by the next sync.
  checkCuda(cudaGetLastError(), "vectorAdd launch");
  // There is no blocking memcpy to wait on, so synchronise explicitly before
  // the host touches the managed buffers again.
  checkCuda(cudaDeviceSynchronize(), "vectorAdd execution");

  // Print a few elements from each array with printf
  printf("First 5 elements of arrays:\n");
  for (int i = 0; i < 5; i++) {
    printf("a[%d] = %d, b[%d] = %d, c[%d] = %d\n", i, a[i], i, b[i], i, c[i]);
  }

  printf("Last 5 elements of arrays:\n");
  for (int i = N - 5; i < N; i++) {
    printf("a[%d] = %d, b[%d] = %d, c[%d] = %d\n", i, a[i], i, b[i], i, c[i]);
  }

  // Verify the result on the CPU. An explicit comparison is used instead of
  // assert() so the check is still performed when built with -DNDEBUG.
  int mismatch = -1;
  for (int i = 0; i < N; i++) {
    if (c[i] != a[i] + b[i]) {
      mismatch = i;
      break;
    }
  }
  if (mismatch >= 0) {
    fprintf(stderr, "Mismatch at index %d: %d + %d != %d\n", mismatch,
            a[mismatch], b[mismatch], c[mismatch]);
  }

  // Free the unified memory on both the success and the failure path.
  checkCuda(cudaFree(a), "cudaFree(a)");
  checkCuda(cudaFree(b), "cudaFree(b)");
  checkCuda(cudaFree(c), "cudaFree(c)");

  if (mismatch >= 0) {
    return EXIT_FAILURE;
  }
  cout << "Completed successfully";
  return 0;
}



Overwriting unified.cu


In [17]:
!nvcc unified.cu -o unified

In [18]:
!nvcc -arch=sm_75 unified.cu -o unified

In [19]:
!./unified

First 5 elements of arrays:
a[0] = 83, b[0] = 86, c[0] = 169
a[1] = 77, b[1] = 15, c[1] = 92
a[2] = 93, b[2] = 35, c[2] = 128
a[3] = 86, b[3] = 92, c[3] = 178
a[4] = 49, b[4] = 21, c[4] = 70
Last 5 elements of arrays:
a[65531] = 45, b[65531] = 67, c[65531] = 112
a[65532] = 21, b[65532] = 68, c[65532] = 89
a[65533] = 98, b[65533] = 20, c[65533] = 118
a[65534] = 86, b[65534] = 62, c[65534] = 148
a[65535] = 18, b[65535] = 67, c[65535] = 85
Completed successfully