# Notebook 03: Vector Addition on GPU
## Phase 1: Foundations - Thread Hierarchy & Kernel Basics

**Learning Objectives:**
- Understand memory allocation and deallocation on GPU
- Learn data transfer between host and device
- Implement parallel vector addition
- Calculate global thread indices correctly
- Compare CPU vs GPU performance

## Concept: Memory Management and Data Transfer

**CUDA Memory Management Functions:**
- `cudaMalloc()` - Allocate memory on GPU
- `cudaFree()` - Free GPU memory
- `cudaMemcpy()` - Copy data between host and device

**Memory Transfer Directions:**
- `cudaMemcpyHostToDevice` - CPU → GPU
- `cudaMemcpyDeviceToHost` - GPU → CPU
- `cudaMemcpyDeviceToDevice` - GPU → GPU

**Calculating Global Thread Index:**
```cuda
int idx = blockIdx.x * blockDim.x + threadIdx.x;
```

## Example 1: Simple Vector Addition

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Element-wise sum: c[i] = a[i] + b[i], one array element per thread.
// Written for a 1D launch. Threads whose global index is at or beyond n
// do nothing, so the grid may contain more threads than there are elements.
__global__ void vectorAdd(float *a, float *b, float *c, int n) {
    // Position of this thread across the whole 1D grid
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the final block have no element to process
    if (gid >= n) {
        return;
    }

    c[gid] = a[gid] + b[gid];
}

// Minimal end-to-end example: build two 1000-element vectors on the host,
// add them on the GPU, copy the sum back and compare it with a host-side sum.
// CUDA return codes are intentionally not inspected in this first example.
int main() {
    const int n = 1000;
    const size_t size = n * sizeof(float);

    // Host buffers: two inputs and one result
    float *hostA   = (float*)malloc(size);
    float *hostB   = (float*)malloc(size);
    float *hostSum = (float*)malloc(size);

    // Fill inputs so that element k holds k and 2k respectively
    for (int k = 0; k < n; ++k) {
        hostA[k] = (float)k;
        hostB[k] = 2.0f * (float)k;
    }

    // Matching device buffers
    float *devA = NULL;
    float *devB = NULL;
    float *devSum = NULL;
    cudaMalloc(&devA, size);
    cudaMalloc(&devB, size);
    cudaMalloc(&devSum, size);

    // Upload both inputs
    cudaMemcpy(devA, hostA, size, cudaMemcpyHostToDevice);
    cudaMemcpy(devB, hostB, size, cudaMemcpyHostToDevice);

    // Launch configuration: round the block count up so every element is covered
    const int blockSize = 256;
    const int gridSize = (n + blockSize - 1) / blockSize;
    printf("Launching kernel with %d blocks and %d threads per block\n",
           gridSize, blockSize);

    vectorAdd<<<gridSize, blockSize>>>(devA, devB, devSum, n);

    // Download the result
    cudaMemcpy(hostSum, devSum, size, cudaMemcpyDeviceToHost);

    // Locate the first element (if any) that disagrees with the host-side sum
    int firstBad = -1;
    for (int k = 0; k < n && firstBad < 0; ++k) {
        if (hostSum[k] != hostA[k] + hostB[k]) {
            firstBad = k;
        }
    }

    if (firstBad >= 0) {
        printf("Error at index %d: %f != %f\n", firstBad, hostSum[firstBad],
               hostA[firstBad] + hostB[firstBad]);
    } else {
        printf("Vector addition successful!\n");
        printf("Sample results: %f + %f = %f\n", hostA[0], hostB[0], hostSum[0]);
    }

    // Release device buffers, then host buffers
    cudaFree(devA);
    cudaFree(devB);
    cudaFree(devSum);
    free(hostA);
    free(hostB);
    free(hostSum);

    return 0;
}

## Example 2: Vector Addition with Error Checking

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
// prints the file, line and CUDA error string, then terminates the program
// with EXIT_FAILURE.
//
// The argument is parenthesised and the temporary has a name unlikely to
// collide with caller variables, so CUDA_CHECK(err) works even when the
// caller's own variable is called `err`.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cudaCheckStatus_ = (call); \
        if (cudaCheckStatus_ != cudaSuccess) { \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(cudaCheckStatus_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Adds two vectors element by element (c = a + b) using one thread per element.
// Expects a 1D grid of 1D blocks; threads indexed at or past n are idle.
__global__ void vectorAdd(float *a, float *b, float *c, int n) {
    const int element = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard clause: the grid is rounded up, so some threads have no work
    if (element >= n) {
        return;
    }

    const float lhs = a[element];
    const float rhs = b[element];
    c[element] = lhs + rhs;
}

// Adds two vectors of 10000 random floats on the GPU with every CUDA runtime
// call wrapped in CUDA_CHECK, then compares the result with a host-side sum.
// Returns 0 when every element is within tolerance, EXIT_FAILURE otherwise.
int main() {
    const int n = 10000;
    const size_t size = n * sizeof(float);

    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        printf("Host allocation of %zu bytes failed\n", size);
        free(h_a); free(h_b); free(h_c);  // free(NULL) is a no-op
        return EXIT_FAILURE;
    }

    // Inputs in [0, 1]; rand() is left unseeded, so every run uses the same data
    for (int i = 0; i < n; i++) {
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
    }

    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    // Round the block count up so the last partial block covers the tail
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    // A launch returns no status itself: cudaGetLastError() reports launch
    // failures, and the synchronize reports errors raised while the kernel ran.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));

    // Verify against a host-side reference using an absolute tolerance.
    // The absolute value is taken by hand so no float overload of abs() is needed.
    const float tolerance = 1e-5f;
    float maxError = 0.0f;
    int mismatches = 0;
    for (int i = 0; i < n; i++) {
        float error = h_c[i] - (h_a[i] + h_b[i]);
        if (error < 0.0f) error = -error;
        if (error > maxError) maxError = error;
        // Written as !(<=) so that a NaN result is counted as a mismatch
        if (!(error <= tolerance)) mismatches++;
    }
    printf("Max error: %f\n", maxError);
    if (mismatches == 0) {
        printf("Vector addition completed successfully!\n");
    } else {
        printf("Vector addition FAILED: %d of %d elements exceed tolerance %g\n",
               mismatches, n, tolerance);
    }

    free(h_a); free(h_b); free(h_c);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    return mismatches == 0 ? 0 : EXIT_FAILURE;
}

## Example 3: CPU vs GPU Performance Comparison

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// GPU version of the vector sum: each thread computes one c[i] = a[i] + b[i].
// Expects a 1D launch; threads with a global index of n or more write nothing.
__global__ void vectorAddGPU(float *a, float *b, float *c, int n) {
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    const bool inRange = tid < n;

    if (inRange) {
        c[tid] = a[tid] + b[tid];
    }
}

// Sequential host reference: writes a[i] + b[i] into c[i] for the first n
// elements, in ascending order. Does nothing when n is zero or negative.
void vectorAddCPU(float *a, float *b, float *c, int n) {
    // Walk the three arrays in lock-step; the pointers are local copies
    for (int remaining = n; remaining > 0; --remaining) {
        *c = *a + *b;
        ++a;
        ++b;
        ++c;
    }
}

// Stops with a file/line diagnostic when a CUDA runtime call reports an error.
// Guarded so that an existing CUDA_CHECK definition is reused, not redefined.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cudaCheckStatus_ = (call); \
        if (cudaCheckStatus_ != cudaSuccess) { \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(cudaCheckStatus_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
#endif

// Times one 10-million-element vector addition on the CPU (clock()) and on
// the GPU (CUDA events around the kernel only), checks that both produce the
// same values, and prints the two times and their ratio.
// The reported GPU time excludes host<->device transfers.
int main() {
    int n = 10000000;  // 10 million elements
    size_t size = n * sizeof(float);

    printf("Vector size: %d elements (%.2f MB)\n", n, size / 1024.0 / 1024.0);

    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c_cpu = (float*)malloc(size);
    float *h_c_gpu = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c_cpu == NULL || h_c_gpu == NULL) {
        printf("Host allocation failed\n");
        free(h_a); free(h_b); free(h_c_cpu); free(h_c_gpu);  // free(NULL) is a no-op
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; i++) {
        h_a[i] = i * 1.0f;
        h_b[i] = i * 2.0f;
    }

    // Untimed CPU pass so the timed pass does not pay for the first touch of
    // the freshly allocated output buffer.
    vectorAddCPU(h_a, h_b, h_c_cpu, n);

    // CPU timing
    clock_t start_cpu = clock();
    vectorAddCPU(h_a, h_b, h_c_cpu, n);
    clock_t end_cpu = clock();
    double cpu_time = ((double)(end_cpu - start_cpu)) / CLOCKS_PER_SEC * 1000.0;

    // GPU setup
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    cudaEvent_t start_gpu, stop_gpu;
    CUDA_CHECK(cudaEventCreate(&start_gpu));
    CUDA_CHECK(cudaEventCreate(&stop_gpu));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    // Untimed warm-up launch: the first launch of a kernel can carry one-off
    // start-up cost that would otherwise be counted as kernel time.
    vectorAddGPU<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // GPU timing (kernel only)
    CUDA_CHECK(cudaEventRecord(start_gpu));
    vectorAddGPU<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaEventRecord(stop_gpu));
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop_gpu));

    float gpu_time = 0;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu));

    CUDA_CHECK(cudaMemcpy(h_c_gpu, d_c, size, cudaMemcpyDeviceToHost));

    // Verify results match
    bool match = true;
    for (int i = 0; i < n; i++) {
        if (h_c_cpu[i] != h_c_gpu[i]) {
            match = false;
            break;
        }
    }

    printf("\nResults:\n");
    printf("CPU Time: %.2f ms\n", cpu_time);
    printf("GPU Time: %.2f ms\n", gpu_time);
    if (gpu_time > 0.0f) {
        printf("Speedup: %.2fx\n", cpu_time / gpu_time);
    } else {
        // Avoid dividing by a zero elapsed time
        printf("Speedup: n/a (GPU time below timer resolution)\n");
    }
    printf("Results match: %s\n", match ? "YES" : "NO");

    free(h_a); free(h_b); free(h_c_cpu); free(h_c_gpu);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaEventDestroy(start_gpu));
    CUDA_CHECK(cudaEventDestroy(stop_gpu));

    return 0;
}

## Example 4: Different Thread Block Sizes

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// One-thread-per-element vector sum, c[i] = a[i] + b[i], for a 1D launch of
// any block size. Threads whose global index reaches n or more exit
// without touching memory.
__global__ void vectorAdd(float *a, float *b, float *c, int n) {
    const int slot = threadIdx.x + blockIdx.x * blockDim.x;

    // Out-of-range threads exist whenever n is not a multiple of the block size
    if (!(slot < n)) {
        return;
    }

    c[slot] = a[slot] + b[slot];
}

// Stops with a file/line diagnostic when a CUDA runtime call reports an error.
// Guarded so that an existing CUDA_CHECK definition is reused, not redefined.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cudaCheckStatus_ = (call); \
        if (cudaCheckStatus_ != cudaSuccess) { \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(cudaCheckStatus_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
#endif

// Runs the same 1,000,000-element vector addition once per block size in
// blockSizes[] and prints the kernel time measured with CUDA events.
// Only the kernel is timed; the result is not copied back to the host.
int main() {
    int n = 1000000;
    size_t size = n * sizeof(float);

    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL) {
        printf("Host allocation failed\n");
        free(h_a); free(h_b);  // free(NULL) is a no-op
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; i++) {
        h_a[i] = i * 1.0f;
        h_b[i] = i * 2.0f;
    }

    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    // Test different block sizes
    int blockSizes[] = {32, 64, 128, 256, 512, 1024};
    // Derived from the array so adding or removing an entry needs no other edit
    const int numBlockSizes = (int)(sizeof(blockSizes) / sizeof(blockSizes[0]));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Untimed warm-up launch: without it, whichever block size is measured
    // first also absorbs the one-off cost of the first kernel launch.
    {
        const int warmupThreads = 256;
        const int warmupBlocks = (n + warmupThreads - 1) / warmupThreads;
        vectorAdd<<<warmupBlocks, warmupThreads>>>(d_a, d_b, d_c, n);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    printf("Testing different block sizes:\n");
    printf("----------------------------------------\n");

    for (int i = 0; i < numBlockSizes; i++) {
        int threadsPerBlock = blockSizes[i];
        int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

        CUDA_CHECK(cudaEventRecord(start));
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
        CUDA_CHECK(cudaEventRecord(stop));
        // An unsupported launch configuration is reported here instead of
        // silently producing a meaningless timing line.
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

        printf("Block size %4d: %d blocks, Time: %.3f ms\n",
               threadsPerBlock, blocksPerGrid, milliseconds);
    }

    free(h_a); free(h_b);
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

## Practical Exercise

**Exercise 1:** Implement vector subtraction (c[i] = a[i] - b[i])

**Exercise 2:** Implement vector scaling (c[i] = a[i] * scalar)

**Exercise 3:** Implement element-wise vector multiplication (c[i] = a[i] * b[i])

**Exercise 4:** Test with different data sizes and find the break-even point where the GPU becomes faster than the CPU

In [None]:
%%cu
// Your solution here
#include <stdio.h>
#include <stdlib.h>

// Exercise skeleton: an empty kernel to be filled in with the element-wise
// operation from the exercise you are solving (subtraction, scaling or
// multiplication). The parameter list mirrors the vectorAdd examples
// (presumably a and b are inputs, c is the output, n is the element count);
// change it if your exercise needs something else, e.g. a scalar factor.
__global__ void vectorOp(float *a, float *b, float *c, int n) {
    // TODO: Implement your operation
    // Hint: compute the global thread index and guard it against n before
    // reading or writing any element.
}

// Exercise skeleton: host-side driver for vectorOp. Currently does nothing
// and returns 0.
int main() {
    // TODO: Implement your solution
    // Suggested outline, following the earlier examples in this notebook:
    // allocate and fill host buffers, allocate device buffers, copy inputs to
    // the device, launch vectorOp, copy the result back, verify it, free memory.
    
    return 0;
}

## Key Takeaways

1. **Global thread index** calculation: `blockIdx.x * blockDim.x + threadIdx.x`
2. Always perform **boundary checks** to avoid memory access errors
3. **cudaMalloc** allocates GPU memory, **cudaFree** releases it
4. **cudaMemcpy** transfers data between host and device
5. Calculate the number of blocks with ceiling division, so a partial final block still covers the tail: `(n + threadsPerBlock - 1) / threadsPerBlock`
6. The GPU shows a speedup only for sufficiently large data sizes
7. Different block sizes can affect performance

## Next Steps

In the next notebook, we'll learn how to:
- Work with 2D thread blocks and grids
- Perform matrix addition
- Calculate 2D thread indices
- Handle 2D data structures on GPU

Continue to: **04_matrix_add.ipynb**

## Notes

*Use this space to write your own notes and observations:*

---



---