# Notebook 12: Warp Divergence and Branch Efficiency

## Phase 3: Optimization Fundamentals

**Learning Objectives:**

- Understand how warps execute threads
- Learn what causes branch divergence
- Understand the performance impact of divergence
- Apply the concepts in practical scenarios
- Measure and analyze performance

## Concept: Warp Divergence and Branch Efficiency

**Topics Covered:**

- Warp execution
- Branch divergence
- Performance impact

**Key Concepts:**

This notebook covers warp execution in the context of Phase 3: Optimization Fundamentals.

## Example 1: Basic Warp Divergence and Branch Efficiency

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call once and aborts the program, reporting the
// source location and the runtime's error string, if it did not succeed.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(err_));                    \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Branches on the parity of the global index, so adjacent lanes of the same
// warp take different paths: even indices run a 100-iteration accumulation,
// odd indices perform a single increment.
//
// Launch: 1D grid of 1D blocks covering at least n threads.
// data:   device array of n ints, updated in place.
__global__ void divergentKernel(int *data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        if (idx % 2 == 0) {
            for (int i = 0; i < 100; i++) data[idx] += i;
        } else {
            data[idx] += 1;
        }
    }
}

// Every in-range thread runs the same 100-iteration accumulation, so the only
// branch is the bounds check.
//
// Launch: 1D grid of 1D blocks covering at least n threads.
// data:   device array of n ints, updated in place.
__global__ void nonDivergentKernel(int *data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        for (int i = 0; i < 100; i++) data[idx] += i;
    }
}

// Times both kernels with CUDA events and prints the elapsed times and ratio.
//
// NOTE: the two kernels do not perform the same total work (odd threads of
// divergentKernel do one add, while every thread of nonDivergentKernel does
// 100), so the printed ratio is not a pure measure of divergence cost.
int main() {
    printf("=== Warp Divergence ===\n\n");

    int n = 1 << 20;
    int *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, n * sizeof(int)));
    // cudaMalloc does not initialize memory; the kernels read data[idx]
    // before writing it, so give the buffer a defined starting value.
    CUDA_CHECK(cudaMemset(d_data, 0, n * sizeof(int)));

    int threads = 256, blocks = (n + threads - 1) / threads;

    // Warm-up launches so one-time startup costs are not attributed to
    // whichever kernel happens to be timed first.
    divergentKernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());
    nonDivergentKernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    divergentKernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float divergentTime;
    CUDA_CHECK(cudaEventElapsedTime(&divergentTime, start, stop));

    CUDA_CHECK(cudaEventRecord(start));
    nonDivergentKernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float nonDivergentTime;
    CUDA_CHECK(cudaEventElapsedTime(&nonDivergentTime, start, stop));

    printf("Divergent: %.2f ms\n", divergentTime);
    printf("Non-divergent: %.2f ms\n", nonDivergentTime);
    printf("Performance difference: %.1fx\n", divergentTime / nonDivergentTime);

    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return 0;
}

## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Block-level sum reduction producing one partial sum per block.
//
// Launch:  1D grid of 1D blocks. blockDim.x must be a power of two and at
//          most 256 (the size of the static shared buffer below).
// input:   device array of n floats.
// output:  device array with at least gridDim.x floats; output[b] receives
//          the partial sum accumulated by block b.
// n:       number of valid elements in input; n <= 0 yields all-zero partials.
//
// Any grid size is valid: each thread walks the input with a grid-stride loop,
// consuming two elements (blockDim.x apart) per step. With the usual launch of
// ceil(n / (2 * blockDim.x)) blocks every thread executes at most one step.
__global__ void reductionOptimized(float *input, float *output, int n) {
    __shared__ float sdata[256];

    const unsigned int tid = threadIdx.x;

    // 64-bit indices so that neither the start offset nor the stride can wrap
    // when n approaches INT_MAX.
    const size_t count = (n > 0) ? (size_t)n : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x * 2;

    // Add during load: fold this thread's elements into a register first.
    float acc = 0.0f;
    for (size_t i = (size_t)blockIdx.x * blockDim.x * 2 + tid; i < count;
         i += stride) {
        acc += input[i];
        if (i + blockDim.x < count) acc += input[i + blockDim.x];
    }
    sdata[tid] = acc;
    __syncthreads();

    // Sequential addressing: the active threads are always the contiguous
    // range [0, s), so whole warps drop out together and lanes of one warp
    // only disagree on the branch once s falls below the warp size.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        // Outside the branch so that every thread in the block reaches it.
        __syncthreads();
    }

    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}

// Aborts with a message naming the failed step if a CUDA runtime call did not
// return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Sums one million 1.0f values on the GPU with reductionOptimized, finishes
// the reduction of the per-block partials on the host, and reports the result
// and the kernel time measured with CUDA events.
int main() {
    int n = 1000000;
    size_t size = n * sizeof(float);

    printf("=== Optimized Parallel Reduction ===\n\n");

    float *h_input = (float*)malloc(size);
    if (h_input == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_input[i] = 1.0f;

    float *d_input, *d_output;
    checkCuda(cudaMalloc(&d_input, size), "cudaMalloc d_input");

    // Each block consumes 2 * threadsPerBlock elements, hence the doubled
    // divisor in the ceil-division. threadsPerBlock must not exceed the
    // kernel's 256-entry shared buffer.
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock * 2 - 1) / (threadsPerBlock * 2);
    size_t partialBytes = blocksPerGrid * sizeof(float);
    checkCuda(cudaMalloc(&d_output, partialBytes), "cudaMalloc d_output");

    checkCuda(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice),
              "copy input to device");

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

    checkCuda(cudaEventRecord(start), "cudaEventRecord start");
    reductionOptimized<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    // A kernel launch returns nothing; configuration errors surface here.
    checkCuda(cudaGetLastError(), "reductionOptimized launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
    // Execution errors inside the kernel surface at this synchronization.
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");

    float milliseconds = 0;
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop),
              "cudaEventElapsedTime");

    float *h_output = (float*)malloc(partialBytes);
    if (h_output == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", partialBytes);
        return EXIT_FAILURE;
    }
    checkCuda(cudaMemcpy(h_output, d_output, partialBytes, cudaMemcpyDeviceToHost),
              "copy partial sums to host");

    // Final pass on the host: add up the per-block partial sums.
    float sum = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        sum += h_output[i];
    }

    printf("Sum: %.0f (expected: %d)\n", sum, n);
    printf("Time: %.3f ms\n", milliseconds);
    printf("Result: %s\n\n", (sum == n) ? "CORRECT" : "INCORRECT");
    printf("OPTIMIZATION: Sequential addressing avoids warp divergence!\n");

    free(h_input); free(h_output);
    checkCuda(cudaFree(d_input), "cudaFree d_input");
    checkCuda(cudaFree(d_output), "cudaFree d_output");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy start");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy stop");

    return 0;
}

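## Key Takeaways

1. When threads of the same warp take different branches, the warp executes each path in turn with the non-participating lanes masked off
2. Branching on a value that is uniform across a warp avoids that serialization
3. Sequential addressing in a reduction keeps the active threads contiguous, so whole warps go idle instead of diverging

Related technique (warp shuffle), useful for the final steps of a reduction:

1. Warp shuffle allows intra-warp communication
2. No shared memory required
3. Lower latency than shared memory
4. Operations: `__shfl_sync`, `__shfl_down_sync`, `__shfl_up_sync`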

## Next Steps

Continue to: **13_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

------