# Notebook 21: Atomic Operations Patterns

## Phase 4: Advanced Memory & Synchronization

**Learning Objectives:**

- Understand atomic operations
- Learn atomicAdd
- Master atomicCAS
- Apply concepts in practical scenarios
- Measure and analyze performance

## Concept: Atomic Operations Patterns

**Topics Covered:**

- atomic operations
- atomicAdd
- atomicCAS
- thread-safety

**Key Concepts:**

This notebook covers atomic operations in the context of Phase 4: Advanced Memory & Synchronization.

## Example 1: Basic Atomic Operations Patterns

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <math.h>

// Evaluates a CUDA runtime expression exactly once and terminates on failure.
// If the result is not cudaSuccess, prints the source file, line number and
// the cudaGetErrorString() text to stderr, then exits with EXIT_FAILURE.
// The argument is parenthesised, and the temporary uses a name unlikely to
// clash with caller variables, so that CUDA_CHECK(err) in a scope that owns a
// variable called `err` does not read the macro's own uninitialised local.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)


// Adds 1 to *counter once for every thread whose flat 1D index is below n.
//   counter : device pointer to the shared tally (updated with atomicAdd)
//   n       : number of logical work items; threads at or past n do nothing
// Indexing uses only the x dimension of the grid and block.
__global__ void atomicAddKernel(int* counter, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n) {
        return;  // tail threads of the last block have no work item
    }
    // Every participating thread targets the same address; the atomic keeps
    // the concurrent read-modify-write updates from being lost.
    atomicAdd(counter, 1);
}

// Folds data[0..n) into a running minimum and maximum using atomics.
//   data    : device array of at least n ints (read only)
//   min_val : device pointer updated with atomicMin; the caller supplies its
//             starting value
//   max_val : device pointer updated with atomicMax; the caller supplies its
//             starting value
//   n       : element count; threads whose flat index is >= n do nothing
// Indexing uses only the x dimension of the grid and block.
__global__ void atomicMinMaxKernel(const int* data, int* min_val, int* max_val, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n) {
        return;  // outside the array
    }
    const int value = data[tid];
    atomicMin(min_val, value);
    atomicMax(max_val, value);
}

// Host driver for two atomic-operation self-tests.
//   Test 1: launches atomicAddKernel over N threads and checks that the
//           device counter ends up equal to N.
//   Test 2: fills N ints with rand() % 1000, runs atomicMinMaxKernel, and
//           compares the device min/max against a sequential host scan.
// Prints PASS/FAIL per check and returns 0; any CUDA runtime failure or host
// allocation failure terminates the process with EXIT_FAILURE.
int main() {
    printf("=== Atomic Operations ===\n\n");
    const int N = 1 << 20;
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;  // ceil-div so every element is covered

    printf("Test 1: atomicAdd\n");
    int *d_counter;
    CUDA_CHECK(cudaMalloc(&d_counter, sizeof(int)));
    CUDA_CHECK(cudaMemset(d_counter, 0, sizeof(int)));

    atomicAddKernel<<<blocks, threads>>>(d_counter, N);
    // A launch does not return a status; an invalid configuration is only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    int h_counter;
    // Blocking copy: also waits for the kernel above to finish.
    CUDA_CHECK(cudaMemcpy(&h_counter, d_counter, sizeof(int), cudaMemcpyDeviceToHost));
    printf("  Counter: %d (expected %d) - %s\n\n", h_counter, N,
           (h_counter == N) ? "PASS" : "FAIL");

    printf("Test 2: atomicMin/Max\n");
    int *h_data = (int*)malloc(N * sizeof(int));
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", N * sizeof(int));
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < N; i++) h_data[i] = rand() % 1000;

    int *d_data, *d_min, *d_max;
    CUDA_CHECK(cudaMalloc(&d_data, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_min, sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_max, sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, N * sizeof(int), cudaMemcpyHostToDevice));

    // Seed both accumulators with a real element instead of fixed sentinels so
    // the result stays correct whatever value range the data has.
    int init_min = h_data[0], init_max = h_data[0];
    CUDA_CHECK(cudaMemcpy(d_min, &init_min, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_max, &init_max, sizeof(int), cudaMemcpyHostToDevice));

    atomicMinMaxKernel<<<blocks, threads>>>(d_data, d_min, d_max, N);
    CUDA_CHECK(cudaGetLastError());

    int h_min, h_max;
    CUDA_CHECK(cudaMemcpy(&h_min, d_min, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&h_max, d_max, sizeof(int), cudaMemcpyDeviceToHost));

    // Sequential host reference for the device result.
    int ref_min = h_data[0], ref_max = h_data[0];
    for (int i = 1; i < N; i++) {
        if (h_data[i] < ref_min) ref_min = h_data[i];
        if (h_data[i] > ref_max) ref_max = h_data[i];
    }

    printf("  Min: %d (expected %d) - %s\n", h_min, ref_min,
           (h_min == ref_min) ? "PASS" : "FAIL");
    printf("  Max: %d (expected %d) - %s\n", h_max, ref_max,
           (h_max == ref_max) ? "PASS" : "FAIL");

    free(h_data);
    CUDA_CHECK(cudaFree(d_counter));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_min));
    CUDA_CHECK(cudaFree(d_max));
    return 0;
}


## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime expression exactly once and terminates on failure.
// If the result is not cudaSuccess, prints the source file, line number and
// the cudaGetErrorString() text to stderr, then exits with EXIT_FAILURE.
// The argument is parenthesised, and the temporary uses a name unlikely to
// clash with caller variables, so that CUDA_CHECK(err) in a scope that owns a
// variable called `err` does not read the macro's own uninitialised local.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Doubles each of the first n floats of data in place.
//   data : device array of at least n floats (read and written)
//   n    : element count; threads whose flat index is >= n do nothing
// Indexing uses only the x dimension of the grid and block; one element per
// thread.
__global__ void kernel(float *data, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n) {
        return;  // past the end of the array
    }
    data[tid] *= 2.0f;
}

// Host driver for the exercise cell: uploads n floats, times one launch of
// `kernel` with CUDA events, copies the result back, and prints the elapsed
// time plus an effective bandwidth figure (one read and one write per
// element). Returns 0; any CUDA runtime failure or host allocation failure
// terminates the process with EXIT_FAILURE.
int main() {
    printf("=== Atomics ===\n\n");

    int n = 1000000;
    size_t size = n * sizeof(float);

    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    int threads = 256;
    int blocks = (n + threads - 1) / threads;  // ceil-div so the tail is covered

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<blocks, threads>>>(d_data, n);
    // A launch does not return a status; an invalid configuration is only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    // Wait for the stop event before reading the elapsed time.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    printf("Processed %d elements in %.2f ms\n", n, ms);
    // size * 2: each element is read once and written once by the kernel.
    printf("Bandwidth: %.2f GB/s\n", (size * 2 / 1e9) / (ms / 1000.0));

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

## Key Takeaways

1. Atomic operations ensure thread-safe updates
2. Useful for histograms, reductions, locks
3. Can create contention bottlenecks
4. Types: atomicAdd, atomicMax, atomicCAS

## Next Steps

Continue to: **22_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

------