In [1]:
!nvidia-smi

Wed Feb  5 23:34:05 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [38]:
%%writefile vector_addition.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

__global__ void initWith(float num, float *a, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = idx; i < N; i += stride) {
        a[i] = num;
    }
}

__global__ void addVectorsInto(float *result, float *a, float *b, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = idx; i < N; i += stride) {
        result[i] = a[i] + b[i];
    }
}

void test(float target, float *array, int N) {
    for (int i = 0; i < N; i++) {
        if (array[i] != target) {
            printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
            exit(1);
        }
    }
    printf("SUCCESS! All values added correctly.\n");
}

int main() {
    const int N = 1 << 26; // 2^26 elements
    size_t size = N * sizeof(float);

    // Allocate host memory
    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_c = (float *)malloc(size);

    size_t num_threads = 1024;
    size_t num_blocks = (N + num_threads - 1) / num_threads;

    // Allocate device memory
    float *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);


    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);


    clock_t start, end;
    start = clock();


    initWith<<<num_blocks, num_threads>>>(3, d_a, N);
    initWith<<<num_blocks, num_threads>>>(4, d_b, N);
    initWith<<<num_blocks, num_threads>>>(0, d_c, N);


    addVectorsInto<<<num_blocks, num_threads>>>(d_c, d_a, d_b, N);


    cudaDeviceSynchronize();
    end = clock();


    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    // Verify correctness
    test(7, h_c, N);


    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
    }


    float time2 = ((float)(end - start)) / CLOCKS_PER_SEC;
    printf("CUDA: %f seconds\n", time2);


    free(h_a);
    free(h_b);
    free(h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


Overwriting vector_addition.cu


In [32]:
!nvcc vector_addition.cu -o abc -arch=sm_75

In [33]:
!./abc

SUCCESS! All values added correctly.
CUDA: 0.068715 seconds


In [39]:
%%writefile vector_addition_cpu.cpp

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void initWith(float num, float *a, int N)
{
    for (int i = 0; i < N; i++)
    {
        a[i] = num;
    }
}

void addVectorsInto(float *result, float *a, float *b, int N)
{
    for (int i = 0; i < N; i++)
    {
        result[i] = a[i] + b[i];
    }
}

void test(float target, float *array, int N)
{
    for (int i = 0; i < N; i++)
    {
        if (array[i] != target)
        {
            printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
            exit(1);
        }
    }
    printf("SUCCESS! All values added correctly.\n");
}

int main()
{
    const int N = 2 << 26;
    size_t size = N * sizeof(float);

    float *a = (float *)malloc(size);
    float *b = (float *)malloc(size);
    float *c = (float *)malloc(size);

    clock_t start, end;
    start = clock();

    initWith(3, a, N);
    initWith(4, b, N);
    initWith(0, c, N);

    addVectorsInto(c, a, b, N);

    end = clock();
    float time2 = ((float)(end - start)) / CLOCKS_PER_SEC;


    test(7, c, N);
    printf("CPU: %f seconds\n", time2);



    free(a);
    free(b);
    free(c);

    return 0;
}


Writing vector_addition_cpu.cpp


In [42]:
!g++ vector_addition_cpu.cpp -o cpu_code

In [43]:
!./cpu_code

SUCCESS! All values added correctly.
CPU: 1.849633 seconds
