# Memory hierarchy

## Notebook setup


In [None]:
# Install the nvcc4jupyter plugin over HTTPS: GitHub no longer serves the
# unauthenticated git:// protocol, so the former git+git:// URL cannot be fetched.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
# NOTE(review): this installs the repository's current default branch. The
# `nvcc_plugin` extension name and the %%cu / %%cuda --name magics used in this
# notebook presumably belong to an older plugin release - pin a matching
# revision if the load below fails.
%load_ext nvcc_plugin

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Prints the name, memory clock, memory bus width and a derived peak memory
// bandwidth for CUDA device 0, followed by the number of visible devices.
// Returns 0 on success and 1 when a CUDA runtime query fails.
int main() {
    int numDevs = 0;
    cudaError_t status = cudaGetDeviceCount(&numDevs);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n",
                cudaGetErrorString(status));
        return 1;
    }

    // Without any device there is nothing to describe; querying device 0
    // would fail and leave `prop` uninitialized.
    if (numDevs < 1) {
        printf("Num devices %d\n", numDevs);
        return 0;
    }

    cudaDeviceProp prop;
    status = cudaGetDeviceProperties(&prop, 0);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n",
                cudaGetErrorString(status));
        return 1;
    }

    printf("Device Number: %d\n", 0);
    printf("  Device name: %s\n", prop.name);
    printf("  Memory Clock Rate (KHz): %d\n",
          prop.memoryClockRate);
    printf("  Memory Bus Width (bits): %d\n",
          prop.memoryBusWidth);
    // clock (kHz) * bus width (bytes) / 1e6 gives GB/s; the factor 2.0
    // presumably accounts for double-data-rate memory.
    printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
          2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    printf("Num devices %d\n", numDevs);
    return 0;
}

# Device, Shared and Private memory

In [None]:
%%cuda --name test.cu

#include <iostream>

#define VECTOR_SIZE 128
#define BLOCK_SIZE 128

// Copies the first `vectorSize` elements of `input` into `output` in reverse
// order, reading and writing through the two pointers directly.
//
// Kernel arguments are passed by value, so every thread gets its own copy of
// the two addresses and of `vectorSize`; the addresses themselves are expected
// to reference device memory.
//
// Every thread that runs this kernel walks the whole vector, so it is meant
// to be launched with a single thread.
void __global__ nonSharedReverseOrder(int* input, int* output, int vectorSize) {
    int src = 0;
    int dst = vectorSize - 1;

    // Walk the input forwards while the write position walks backwards.
    while (src < vectorSize) {
        output[dst] = input[src];
        ++src;
        --dst;
    }
}

extern __shared__ int sharedMemory[];
// Reverses up to blockDim.x elements of `input` into `output`, staging the
// data in dynamically sized shared memory.
//
// Launch requirements:
//   - a single thread block (only threadIdx.x is used for indexing);
//   - dynamic shared memory of 2 * blockDim.x * sizeof(int) bytes: the first
//     half stages the input, the second half holds the reversed result.
//
// Only min(vectorSize, blockDim.x) elements are processed. Threads beyond
// that count neither read `input` nor write `output`, so a block larger than
// the vector no longer touches memory past its end, and a vector larger than
// the block no longer overruns the shared buffers.
void __global__ sharedDynamicReverseOrder(int* input, int* output, int vectorSize) {
    int* input_sh = sharedMemory;
    int* output_sh = sharedMemory + blockDim.x;

    const int blockLen = blockDim.x;
    const int count = (vectorSize < blockLen) ? vectorSize : blockLen;
    const bool active = (int)threadIdx.x < count;

    // Each active thread stages one element from device memory.
    if (active) {
        input_sh[threadIdx.x] = input[threadIdx.x];
    }

    // Barriers stay outside the guards so every thread of the block reaches them.
    __syncthreads();

    // Thread 0 performs the whole reversal serially, entirely in shared memory.
    if (threadIdx.x == 0) {
        for (int i = 0; i < count; ++i) {
            output_sh[(count - 1) - i] = input_sh[i];
        }
    }
    __syncthreads();

    // Each active thread publishes one reversed element back to device memory.
    if (active) {
        output[threadIdx.x] = output_sh[threadIdx.x];
    }
}

// Reverses up to BLOCK_SIZE elements of `input` into `output`, staging the
// data in two statically sized shared-memory arrays of BLOCK_SIZE ints.
//
// Launch requirements: a single thread block (only threadIdx.x is used for
// indexing). No dynamic shared memory is needed.
//
// Only min(vectorSize, BLOCK_SIZE) elements are processed. Threads beyond
// that count neither read `input` nor write `output`, so neither a block
// larger than the vector nor a vector/block larger than BLOCK_SIZE can index
// outside the global or shared buffers.
void __global__ sharedStaticReverseOrder(int* input, int* output, int vectorSize) {
    __shared__ int input_sh[BLOCK_SIZE];
    __shared__ int output_sh[BLOCK_SIZE];

    const int count = (vectorSize < BLOCK_SIZE) ? vectorSize : BLOCK_SIZE;
    const bool active = (int)threadIdx.x < count;

    // Each active thread stages one element from device memory.
    if (active) {
        input_sh[threadIdx.x] = input[threadIdx.x];
    }

    // Barriers stay outside the guards so every thread of the block reaches them.
    __syncthreads();

    // Thread 0 performs the whole reversal serially, entirely in shared memory.
    if (threadIdx.x == 0) {
        for (int i = 0; i < count; ++i) {
            output_sh[(count - 1) - i] = input_sh[i];
        }
    }
    __syncthreads();

    // Each active thread publishes one reversed element back to device memory.
    if (active) {
        output[threadIdx.x] = output_sh[threadIdx.x];
    }
}

// Reports a failed CUDA runtime call on std::cerr.
// Returns true when `status` is an error and false when it is cudaSuccess.
static bool cudaFailed(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return false;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return true;
}

// Host driver: fills a VECTOR_SIZE vector with 0..VECTOR_SIZE-1, reverses it
// on the GPU with one of the kernels above and prints the result.
// Returns 0 on success and 1 as soon as any CUDA call fails (remaining
// resources are then reclaimed by process exit).
int main() {
    const size_t bytes = VECTOR_SIZE * sizeof(int);
    int *h_in = 0, *h_out = 0, *d_in = 0, *d_out = 0;

    // If we create the host memory with CUDA,
    // the transfers between GPU and CPU are faster
    if (cudaFailed(cudaMallocHost(&h_in, bytes), "cudaMallocHost(h_in)") ||
        cudaFailed(cudaMallocHost(&h_out, bytes), "cudaMallocHost(h_out)") ||
        cudaFailed(cudaMalloc(&d_in, bytes), "cudaMalloc(d_in)") ||
        cudaFailed(cudaMalloc(&d_out, bytes), "cudaMalloc(d_out)")) {
        return 1;
    }

    for (int i=0; i<VECTOR_SIZE; ++i) {
        h_in[i] = i;
    }

    cudaStream_t stream;
    if (cudaFailed(cudaStreamCreate(&stream), "cudaStreamCreate")) {
        return 1;
    }

    // Upload, kernel and download are all queued on the same stream, so they
    // execute in order without blocking the host until the synchronize below.
    if (cudaFailed(cudaMemcpyAsync(d_in, h_in, bytes, cudaMemcpyHostToDevice, stream),
                   "cudaMemcpyAsync(host to device)")) {
        return 1;
    }

    /*dim3 block(1);
    dim3 grid(1);
    nonSharedReverseOrder<<<grid, block, 0, stream>>>(d_in, d_out, VECTOR_SIZE);*/

    dim3 block(BLOCK_SIZE);
    dim3 grid(1);
    sharedStaticReverseOrder<<<grid, block, 0, stream>>>(d_in, d_out, VECTOR_SIZE);

    /*size_t sharedSize = BLOCK_SIZE * sizeof(int) * 2;
    dim3 block(BLOCK_SIZE);
    dim3 grid(1);
    sharedDynamicReverseOrder<<<grid, block, sharedSize, stream>>>(d_in, d_out, VECTOR_SIZE);*/

    // A kernel launch returns nothing; an invalid launch configuration is
    // only visible through cudaGetLastError().
    if (cudaFailed(cudaGetLastError(), "kernel launch")) {
        return 1;
    }

    if (cudaFailed(cudaMemcpyAsync(h_out, d_out, bytes, cudaMemcpyDeviceToHost, stream),
                   "cudaMemcpyAsync(device to host)")) {
        return 1;
    }

    // h_out may only be read once the stream has drained; errors raised while
    // the kernel was executing are also reported here.
    if (cudaFailed(cudaStreamSynchronize(stream), "cudaStreamSynchronize")) {
        return 1;
    }

    std::cout << "Result: ";
    for (int i=0; i<VECTOR_SIZE; ++i) {
        std::cout << h_out[i] << " "; 
    }
    std::cout << std::endl;

    // Release the stream, the device buffers and the pinned host buffers.
    if (cudaFailed(cudaStreamDestroy(stream), "cudaStreamDestroy") ||
        cudaFailed(cudaFree(d_in), "cudaFree(d_in)") ||
        cudaFailed(cudaFree(d_out), "cudaFree(d_out)") ||
        cudaFailed(cudaFreeHost(h_in), "cudaFreeHost(h_in)") ||
        cudaFailed(cudaFreeHost(h_out), "cudaFreeHost(h_out)")) {
        return 1;
    }

    return 0;
}

In [None]:
# Compile test.cu into an executable named `test`.
# NOTE(review): /content/src is presumably where the `%%cuda --name test.cu`
# cell stored the source - confirm for the installed plugin version.
!nvcc /content/src/test.cu -o test

In [None]:
# Run the compiled program; it prints the reversed vector.
!./test

In [None]:
# Run the same binary under the nvprof profiler.
!nvprof ./test

# Multi-thread-block version


In [None]:
%%cuda --name testBig.cu

#include <iostream>

#define VECTOR_SIZE 100000000
#define BLOCK_SIZE 128

// Reverses `input` into `output` one BLOCK_SIZE-element chunk per thread block,
// working directly on the memory the two pointers reference.
//
// Block b handles input indices [b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE), cut
// off at vectorSize, and stores element i at output[(vectorSize - 1) - i].
// The chunk start depends only on blockIdx.x, so every thread of a block would
// repeat the same loop: the kernel is meant to run with one thread per block.
void __global__ nonSharedReverseOrderBig(int* input, int* output, int vectorSize) {
    // First input index owned by this block.
    const int chunkBegin = blockIdx.x * BLOCK_SIZE;

    // One past the last index owned by this block, clipped to the vector length
    // so the final (possibly partial) chunk stays in range.
    const int chunkLimit = chunkBegin + BLOCK_SIZE;
    const int chunkEnd = (chunkLimit < vectorSize) ? chunkLimit : vectorSize;

    for (int src = chunkBegin; src < chunkEnd; ++src) {
        output[(vectorSize - 1) - src] = input[src];
    }
}

extern __shared__ int sharedMemory[];
// Reverses a vector of `vectorSize` ints from `input` into `output` using many
// thread blocks, staging each block's chunk in dynamic shared memory.
//
// Launch requirements:
//   - a 1D grid whose blocks together cover at least vectorSize elements;
//   - dynamic shared memory of 2 * blockDim.x * sizeof(int) bytes: the first
//     half stages the chunk, the second half holds the reversed chunk.
//
// Parameters:
//   input      - source vector (vectorSize ints).
//   output     - destination vector (vectorSize ints).
//   vectorSize - number of elements to reverse.
//   modulus    - ignored; kept so existing launches keep compiling. Each
//                thread's destination is derived from the index of the element
//                it publishes, which stays correct when vectorSize is an exact
//                multiple of the block size (where a caller-computed padding
//                of blockDim.x instead of 0 used to shift every write).
void __global__ sharedDynamicReverseOrderBig(int* input, int* output, int vectorSize, int modulus) {
    (void)modulus;

    int* input_sh = sharedMemory;
    int* output_sh = sharedMemory + blockDim.x;

    // The chunk length follows the actual launch configuration, matching how
    // the shared buffer is split above.
    const int blockLen = blockDim.x;

    // 64-bit global indices so blockIdx.x * blockDim.x cannot wrap.
    const long long blockStart = (long long)blockIdx.x * blockDim.x;
    const long long inputGIdx = blockStart + threadIdx.x;

    // Stage this thread's element; threads past the end of the vector skip it.
    if (inputGIdx < vectorSize) {
        input_sh[threadIdx.x] = input[inputGIdx];
    }
    __syncthreads();

    // Thread 0 reverses the valid part of the chunk serially in shared memory.
    // In a partial last block only the upper slots of output_sh get written.
    if (threadIdx.x == 0) {
        for (int i = 0; i < blockLen && (blockStart + i) < vectorSize; ++i) {
            output_sh[(blockLen - 1) - i] = input_sh[i];
        }
    }
    __syncthreads();

    // output_sh[t] holds the element read from global index
    // blockStart + (blockLen - 1 - t); its reversed position is
    // (vectorSize - 1) minus that index. Slots whose source index lies past
    // the end of the vector were never written and are skipped.
    const long long sourceGIdx = blockStart + ((blockLen - 1) - (int)threadIdx.x);
    if (sourceGIdx < vectorSize) {
        output[(vectorSize - 1) - sourceGIdx] = output_sh[threadIdx.x];
    }
}

// Reports a failed CUDA runtime call on std::cerr.
// Returns true when `status` is an error and false when it is cudaSuccess.
static bool cudaFailed(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return false;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return true;
}

// Host driver: fills a VECTOR_SIZE vector with 0..VECTOR_SIZE-1, reverses it
// on the GPU using one thread block per BLOCK_SIZE-element chunk and prints
// the result.
// Returns 0 on success and 1 as soon as any CUDA call fails (remaining
// resources are then reclaimed by process exit).
int main() {
    const size_t bytes = VECTOR_SIZE * sizeof(int);
    int *h_in = 0, *h_out = 0, *d_in = 0, *d_out = 0;

    // If we create the host memory with CUDA,
    // the transfers between GPU and CPU are faster
    if (cudaFailed(cudaMallocHost(&h_in, bytes), "cudaMallocHost(h_in)") ||
        cudaFailed(cudaMallocHost(&h_out, bytes), "cudaMallocHost(h_out)") ||
        cudaFailed(cudaMalloc(&d_in, bytes), "cudaMalloc(d_in)") ||
        cudaFailed(cudaMalloc(&d_out, bytes), "cudaMalloc(d_out)")) {
        return 1;
    }

    for (int i=0; i<VECTOR_SIZE; ++i) {
        h_in[i] = i;
    }

    cudaStream_t stream;
    if (cudaFailed(cudaStreamCreate(&stream), "cudaStreamCreate")) {
        return 1;
    }

    // Upload, kernel and download are all queued on the same stream, so they
    // execute in order without blocking the host until the synchronize below.
    if (cudaFailed(cudaMemcpyAsync(d_in, h_in, bytes, cudaMemcpyHostToDevice, stream),
                   "cudaMemcpyAsync(host to device)")) {
        return 1;
    }

    // Integer ceiling division: one block per BLOCK_SIZE-element chunk, plus
    // one for a partial tail. Avoids the rounding a float division can
    // introduce for large sizes.
    const unsigned int numBlocks = (VECTOR_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;

    dim3 block(1);
    dim3 grid(numBlocks);
    nonSharedReverseOrderBig<<<grid, block, 0, stream>>>(d_in, d_out, VECTOR_SIZE);

    // modulus is the number of unused slots in the last block; the outer
    // "% BLOCK_SIZE" keeps it at 0 (instead of BLOCK_SIZE) when VECTOR_SIZE is
    // an exact multiple of BLOCK_SIZE.
    /*size_t sharedSize = BLOCK_SIZE * sizeof(int) * 2;
    dim3 block(BLOCK_SIZE);
    dim3 grid(numBlocks);
    int modulus = (BLOCK_SIZE - (VECTOR_SIZE % BLOCK_SIZE)) % BLOCK_SIZE;
    sharedDynamicReverseOrderBig<<<grid, block, sharedSize, stream>>>(d_in, d_out, VECTOR_SIZE, modulus);*/

    // A kernel launch returns nothing; an invalid launch configuration is
    // only visible through cudaGetLastError().
    if (cudaFailed(cudaGetLastError(), "kernel launch")) {
        return 1;
    }

    if (cudaFailed(cudaMemcpyAsync(h_out, d_out, bytes, cudaMemcpyDeviceToHost, stream),
                   "cudaMemcpyAsync(device to host)")) {
        return 1;
    }

    // h_out may only be read once the stream has drained; errors raised while
    // the kernel was executing are also reported here.
    if (cudaFailed(cudaStreamSynchronize(stream), "cudaStreamSynchronize")) {
        return 1;
    }

    std::cout << "Result: ";
    for (int i=0; i<VECTOR_SIZE; ++i) {
        std::cout << h_out[i] << " "; 
    }
    std::cout << std::endl;

    // Release the stream, the device buffers and the pinned host buffers.
    if (cudaFailed(cudaStreamDestroy(stream), "cudaStreamDestroy") ||
        cudaFailed(cudaFree(d_in), "cudaFree(d_in)") ||
        cudaFailed(cudaFree(d_out), "cudaFree(d_out)") ||
        cudaFailed(cudaFreeHost(h_in), "cudaFreeHost(h_in)") ||
        cudaFailed(cudaFreeHost(h_out), "cudaFreeHost(h_out)")) {
        return 1;
    }

    return 0;
}

In [None]:
# Compile testBig.cu into an executable named `testBig`.
# NOTE(review): /content/src is presumably where the `%%cuda --name testBig.cu`
# cell stored the source - confirm for the installed plugin version.
!nvcc /content/src/testBig.cu -o testBig

In [None]:
# Run the compiled program; it prints every element of the reversed vector.
!./testBig

In [None]:
# Run the same binary under the nvprof profiler.
!nvprof ./testBig