In [None]:
# Mount Google Drive at /content/drive so the assignment directory used by the
# following cells is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work inside the question1 directory: vectorAdd.cu is written, compiled and run
# relative to this path by the cells below.
%cd /content/drive/MyDrive/DD2360/Assignment4/question1

/content/drive/MyDrive/DD2360/Assignment4/question1


In [None]:
%%writefile vectorAdd.cu
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define DataType double

// Element-wise vector sum: out[k] = in1[k] + in2[k] for every k in [0, len).
// Each thread derives one global index from its block/thread coordinates and
// handles at most that single element; threads whose index falls at or beyond
// `len` do nothing.
__global__ void vecAdd(DataType *in1, DataType *in2, DataType *out, int len) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard clause: surplus threads of the last block have no element to process.
    if (gid >= len) {
        return;
    }

    out[gid] = in1[gid] + in2[gid];
}

// Returns the current wall-clock time in seconds as a double, combining the
// whole-second and microsecond fields reported by gettimeofday().
double CPUtimer() {
    struct timeval now;
    gettimeofday(&now, NULL);

    const double wholeSeconds = (double)now.tv_sec;
    const double microPart = (double)now.tv_usec * 1e-6;  // microseconds -> seconds
    return (wholeSeconds + microPart);
}

// Abort with a readable message when a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Streamed vector addition.
 *
 * Usage: <program> <inputLength> <S_seg>
 *   inputLength  number of elements in each input vector (must be > 0)
 *   S_seg        number of elements processed per segment (must be > 0)
 *
 * The vectors are split into ceil(inputLength / S_seg) segments. Each segment
 * gets its own CUDA stream, in which its two host-to-device copies, the kernel
 * launch and the device-to-host copy are enqueued. The last segment may be
 * shorter than S_seg. The wall-clock time of the whole pipeline is printed and
 * the GPU result is compared element by element against a CPU reference.
 *
 * Returns 0 on completion; exits with EXIT_FAILURE on bad arguments or on any
 * allocation / CUDA runtime failure.
 */
int main(int argc, char **argv) {

    int inputLength;
    DataType *hostInput1 = NULL;
    DataType *hostInput2 = NULL;
    DataType *hostOutput = NULL;
    DataType *resultRef = NULL;
    DataType *deviceInput1 = NULL;
    DataType *deviceInput2 = NULL;
    DataType *deviceOutput = NULL;
    bool flag = true;
    double start, end, duration;

    if (argc != 3) {
        printf("Usage: %s <inputLength> <S_seg>\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    inputLength = atoi(argv[1]);
    int S_seg = atoi(argv[2]);

    // Reject non-positive sizes up front: S_seg == 0 would divide by zero below.
    if (inputLength <= 0 || S_seg <= 0) {
        fprintf(stderr, "inputLength and S_seg must both be positive integers\n");
        exit(EXIT_FAILURE);
    }

    // Integer ceiling division. ceil(a / b) on two ints truncates before ceil()
    // ever sees the value, which silently drops a trailing partial segment.
    const int numSegments = inputLength / S_seg + (inputLength % S_seg != 0 ? 1 : 0);
    const size_t totalBytes = (size_t)inputLength * sizeof(DataType);

    printf("The input length is %d\n", inputLength);
    printf("The size of segments is %d\n", S_seg);

    // Buffers that take part in cudaMemcpyAsync are page-locked (pinned);
    // with pageable memory the copies cannot overlap with kernel execution.
    checkCuda(cudaMallocHost((void **)&hostInput1, totalBytes), "cudaMallocHost(hostInput1)");
    checkCuda(cudaMallocHost((void **)&hostInput2, totalBytes), "cudaMallocHost(hostInput2)");
    checkCuda(cudaMallocHost((void **)&hostOutput, totalBytes), "cudaMallocHost(hostOutput)");

    // The reference result never touches the GPU, so ordinary heap memory is enough.
    resultRef = (DataType *)malloc(totalBytes);

    // One stream per segment, kept on the heap rather than in a
    // variable-length stack array (non-standard in C++).
    cudaStream_t *streams = (cudaStream_t *)malloc((size_t)numSegments * sizeof(cudaStream_t));

    if (resultRef == NULL || streams == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < inputLength; ++i) {
        hostInput1[i] = (DataType)rand() / RAND_MAX;
        hostInput2[i] = (DataType)rand() / RAND_MAX;
        resultRef[i] = hostInput1[i] + hostInput2[i];
    }

    checkCuda(cudaMalloc((void **)&deviceInput1, totalBytes), "cudaMalloc(deviceInput1)");
    checkCuda(cudaMalloc((void **)&deviceInput2, totalBytes), "cudaMalloc(deviceInput2)");
    checkCuda(cudaMalloc((void **)&deviceOutput, totalBytes), "cudaMalloc(deviceOutput)");

    // Create streams
    for (int i = 0; i < numSegments; ++i) {
        checkCuda(cudaStreamCreate(&streams[i]), "cudaStreamCreate");
    }

    start = CPUtimer();

    const int threadsPerBlock = 256;
    dim3 dimBlock(threadsPerBlock);

    for (int i = 0; i < numSegments; i++) {
        const int offset = i * S_seg;

        // The final segment only covers what is left of the vectors.
        const int remaining = inputLength - offset;
        const int segLen = (remaining < S_seg) ? remaining : S_seg;
        const size_t segBytes = (size_t)segLen * sizeof(DataType);

        // Enough blocks to cover segLen elements (at least one, since segLen >= 1).
        dim3 dimGrid(segLen / threadsPerBlock + (segLen % threadsPerBlock != 0 ? 1 : 0));

        checkCuda(cudaMemcpyAsync(deviceInput1 + offset, hostInput1 + offset, segBytes,
                                  cudaMemcpyHostToDevice, streams[i]),
                  "cudaMemcpyAsync(H2D input1)");
        checkCuda(cudaMemcpyAsync(deviceInput2 + offset, hostInput2 + offset, segBytes,
                                  cudaMemcpyHostToDevice, streams[i]),
                  "cudaMemcpyAsync(H2D input2)");

        vecAdd<<<dimGrid, dimBlock, 0, streams[i]>>>(deviceInput1 + offset, deviceInput2 + offset,
                                                     deviceOutput + offset, segLen);
        // Launch-configuration errors are only reported through cudaGetLastError().
        checkCuda(cudaGetLastError(), "vecAdd launch");

        checkCuda(cudaMemcpyAsync(hostOutput + offset, deviceOutput + offset, segBytes,
                                  cudaMemcpyDeviceToHost, streams[i]),
                  "cudaMemcpyAsync(D2H output)");
    }

    // Wait for every stream to drain before stopping the clock.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    end = CPUtimer();
    duration = end - start;
    printf( "Total time : %f.\n", duration);

    // Compare the output with the reference. This must happen while the host
    // buffers are still allocated, and must cover every element of the vectors.
    for (int i = 0; i < inputLength; ++i) {
        if (fabs(hostOutput[i] - resultRef[i]) > 1e-5) {
            printf("Mismatch at index %d: Host %f, GPU %f\n", i, resultRef[i], hostOutput[i]);
            flag = false;
            break;
        }
    }

    if (flag == true) {
        printf("Two vectors are the same\n");
    }

    // Destroy streams
    for (int i = 0; i < numSegments; ++i) {
        cudaStreamDestroy(streams[i]);
    }
    free(streams);

    // Free GPU memory
    cudaFree(deviceInput1);
    cudaFree(deviceInput2);
    cudaFree(deviceOutput);

    // Free CPU memory (pinned buffers go back through the CUDA runtime)
    cudaFreeHost(hostInput1);
    cudaFreeHost(hostInput2);
    cudaFreeHost(hostOutput);
    free(resultRef);

    return 0;
}


Overwriting vectorAdd.cu


In [None]:
# Compile the CUDA source; no -o is passed, so the binary is the default ./a.out
# that the run cells below execute. `ls` confirms the build artefact exists.
!nvcc vectorAdd.cu
!ls

a.out		       profile_mycode.nvvp  test     trace_output.nvvp
profile_mycode.nvprof  profile_vecadd.nvvp  test.cu  vectorAdd.cu


In [None]:
# Smoke test. Arguments are <inputLength> <S_seg>: 1024 elements in 4 segments of 256.
!./a.out 1024 256

The input length is 1024
The size of segments is 256
Total time : 0.000734.
Two vectors are the same


In [None]:
# Smoke test: 2048 elements in 4 segments of 512.
!./a.out 2048 512

The input length is 2048
The size of segments is 512
Total time : 0.000761.
Two vectors are the same


In [None]:
# Smoke test: 4096 elements in 4 segments of 1024.
!./a.out 4096 1024

The input length is 4096
The size of segments is 1024
Total time : 0.000858.
Two vectors are the same


In [None]:
# Record an nvprof timeline for 262144 elements in segments of 1024 and save it
# to profile_vecadd.nvvp (-f overwrites a previous profile of the same name).
!/usr/local/cuda-12/bin/nvprof --output-profile profile_vecadd.nvvp -f ./a.out 262144 1024

The input length is 262144
The size of segments is 1024
==8224== NVPROF is profiling process 8224, command: ./a.out 262144 1024
Total time : 0.010894.
==8224== Generated result file: /content/drive/MyDrive/DD2360/Assignment4/question1/profile_vecadd.nvvp


In [None]:
# Collect Nsight Compute metrics (default set plus active-warp percentage) for
# each vecAdd launch of a 1024-element / 256-per-segment run.
# NOTE(review): the recorded run below ends with "error code (11)"; re-run and
# confirm the application exits cleanly before trusting the numbers.
!ncu --set default --metrics sm__warps_active.avg.pct_of_peak_sustained_active ./a.out 1024 256

The input length is 1024
The size of segments is 256
==PROF== Connected to process 2287 (/content/drive/MyDrive/DD2360/Assignment4/question1/a.out)
==PROF== Profiling "vecAdd" - 0: 0%....50%....100% - 1 pass
==PROF== Profiling "vecAdd" - 1: 0%....50%....100% - 1 pass
==PROF== Profiling "vecAdd" - 2: 0%....50%....100% - 1 pass
==PROF== Profiling "vecAdd" - 3: 0%....50%....100% - 1 pass
Total time : 0.383361.
Two vectors are the same
==ERROR== The application returned an error code (11).
[2287] a.out@127.0.0.1
  vecAdd(double *, double *, double *, int) (1, 1, 1)x(256, 1, 1), Context 1, Stream 13, Device 0, CC 7.5
    Section: Command line profiler metrics
    ------------------------------------------------- ----------- ------------
    Metric Name                                       Metric Unit Metric Value
    ------------------------------------------------- ----------- ------------
    sm__warps_active.avg.pct_of_peak_sustained_active           %        23.95
    -----------------

In [None]:
# Segment-size sweep at a fixed inputLength of 262144: S_seg = 256 (1024 segments).
!./a.out 262144 256

The input length is 262144
The size of segments is 256
Total time : 0.030408.
Two vectors are the same


In [None]:
# Segment-size sweep: S_seg = 512 (512 segments).
!./a.out 262144 512

The input length is 262144
The size of segments is 512
Total time : 0.017020.
Two vectors are the same


In [None]:
# Segment-size sweep: S_seg = 1024 (256 segments).
!./a.out 262144 1024

The input length is 262144
The size of segments is 1024
Total time : 0.010310.
Two vectors are the same


In [None]:
# Segment-size sweep: S_seg = 2048 (128 segments).
!./a.out 262144 2048

The input length is 262144
The size of segments is 2048
Total time : 0.007166.
Two vectors are the same


In [None]:
# Segment-size sweep: S_seg = 4096 (64 segments).
!./a.out 262144 4096

The input length is 262144
The size of segments is 4096
Total time : 0.005836.
Two vectors are the same


In [None]:
# Segment-size sweep: S_seg = 8192 (32 segments).
!./a.out 262144 8192

The input length is 262144
The size of segments is 8192
Total time : 0.004782.
Two vectors are the same


In [None]:
# Segment-size sweep: S_seg = 16384 (16 segments).
!./a.out 262144 16384

The input length is 262144
The size of segments is 16384
Total time : 0.004558.
Two vectors are the same


In [None]:
# Segment-size sweep: S_seg = 32768 (8 segments).
!./a.out 262144 32768

The input length is 262144
The size of segments is 32768
Total time : 0.003666.
Two vectors are the same


In [None]:
# Segment-size sweep: S_seg = 65536 (4 segments).
!./a.out 262144 65536

The input length is 262144
The size of segments is 65536
Total time : 0.003715.
Two vectors are the same
