# Notebook 16: Prefix Sum (Scan) Algorithms

## Phase 3: Optimization Fundamentals

**Learning Objectives:**

- Understand inclusive scan
- Learn exclusive scan
- Master work-efficient scan
- Apply concepts in practical scenarios
- Measure and analyze performance

## Concept: Prefix Sum (Scan) Algorithms

**Topics Covered:**

- inclusive scan
- exclusive scan
- work-efficient scan

**Key Concepts:**

This notebook covers inclusive scan, exclusive scan, and work-efficient scan in the context of Phase 3: Optimization Fundamentals.

## Example 1: Basic Prefix Sum (Scan) Algorithms

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluate a CUDA runtime call once; on failure print the error string to
// stderr and terminate the process. The argument is parenthesized and the
// local is named err_ so that expressions such as CUDA_CHECK(err) still work.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

/*
 * Per-block, in-place exclusive prefix sum (up-sweep / down-sweep).
 *
 * Each block scans its own blockDim.x-element slice of `data` independently;
 * block totals are NOT propagated to later blocks, so the result is a full
 * scan of `data` only when the launch uses a single block.
 *
 * Launch requirements:
 *   - 1D grid and 1D block.
 *   - blockDim.x must be a power of two: the stride doubles from 1 in the
 *     up-sweep and halves from blockDim.x / 2 in the down-sweep, which only
 *     forms a complete tree for power-of-two sizes.
 *   - Dynamic shared memory: blockDim.x * sizeof(float) bytes.
 *
 * data: device pointer to at least n floats; overwritten with the result.
 * n:    number of valid elements; slots past n are treated as 0.0f.
 */
__global__ void scanKernel(float *data, int n) {
    extern __shared__ float temp[];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Stage this block's slice in shared memory, zero-padding the tail.
    temp[tid] = (idx < n) ? data[idx] : 0.0f;
    __syncthreads();

    // Up-sweep: build partial sums; the last slot ends up holding the block total.
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        int index = (tid + 1) * stride * 2 - 1;
        if (index < blockDim.x) temp[index] += temp[index - stride];
        __syncthreads();  // outside the branch so every thread reaches it
    }

    // Replace the block total with the identity to make the scan exclusive.
    if (tid == 0) temp[blockDim.x - 1] = 0.0f;
    __syncthreads();

    // Down-sweep: push prefixes back down the tree.
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        int index = (tid + 1) * stride * 2 - 1;
        if (index < blockDim.x) {
            float t = temp[index - stride];
            temp[index - stride] = temp[index];
            temp[index] += t;
        }
        __syncthreads();
    }

    if (idx < n) data[idx] = temp[tid];
}

// Runs scanKernel on an all-ones input and verifies each block's result.
int main() {
    printf("=== Prefix Sum (Scan) ===\n\n");

    int n = 1024;
    int threads = 256;  // must be a power of two for scanKernel
    int blocks = (n + threads - 1) / threads;
    size_t bytes = (size_t)n * sizeof(float);

    // Known input: all ones. Device memory from cudaMalloc is uninitialized,
    // so scanning it without this upload would read indeterminate values.
    float *h_data = (float*)malloc(bytes);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = 1.0f;

    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, bytes));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice));

    scanKernel<<<blocks, threads, threads * sizeof(float)>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost));

    // Exclusive scan of all ones restarts at 0 in every block, so element i
    // must equal i % threads. These small integers are exact in float.
    int mismatches = 0;
    for (int i = 0; i < n; i++) {
        if (h_data[i] != (float)(i % threads)) mismatches++;
    }

    CUDA_CHECK(cudaFree(d_data));
    free(h_data);

    if (mismatches != 0) {
        fprintf(stderr, "Scan verification failed: %d mismatches\n", mismatches);
        return EXIT_FAILURE;
    }

    printf("Scan completed\n");
    return 0;
}

## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluate a CUDA runtime call exactly once; on failure print the source
// location and error string to stderr, then terminate the process.
// The argument is parenthesized and the local is named err_ so that a caller
// expression mentioning its own `err` variable (e.g. CUDA_CHECK(err)) is not
// shadowed by the macro's temporary.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Doubles each of the first n elements of `data` in place.
// One thread handles one element; threads whose global index falls at or
// beyond n do nothing, so the grid may overshoot the array length.
__global__ void kernel(float *data, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;
    }
    data[gid] *= 2.0f;
}

// Uploads n floats, times one launch of `kernel` with CUDA events, copies the
// result back, and prints elapsed time and effective bandwidth.
// Every CUDA runtime call and the kernel launch are checked so that a failure
// is reported where it happens instead of yielding a bogus timing.
int main() {
    printf("=== Prefix Sum ===\n\n");

    int n = 1000000;
    size_t size = (size_t)n * sizeof(float);

    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    int threads = 256;
    int blocks = (n + threads - 1) / threads;  // ceil-div so the tail is covered

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());  // a launch returns no status; query it explicitly
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    printf("Processed %d elements in %.2f ms\n", n, ms);
    // Factor of 2: every element is read once and written once.
    printf("Bandwidth: %.2f GB/s\n", (size * 2 / 1e9) / (ms / 1000.0));

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

## Key Takeaways

1. Prefix sum (scan) computes cumulative sums
2. Two phases: up-sweep and down-sweep
3. A work-efficient algorithm is important for performance
4. Building block for many algorithms

## Next Steps

Continue to: **17_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

------