# Notebook 13: Warp-Level Primitives and Shuffle Operations

## Phase 3: Optimization Fundamentals

**Learning Objectives:**

- Understand `__shfl` operations
- Learn warp-level communication
- Master reduction
- Apply concepts in practical scenarios
- Measure and analyze performance

## Concept: Warp-Level Primitives and Shuffle Operations

**Topics Covered:**

- `__shfl` operations
- Warp-level communication
- Reduction

**Key Concepts:**

This notebook covers `__shfl` operations in the context of Phase 3: Optimization Fundamentals.

## Example 1: Basic Warp-Level Primitives and Shuffle Operations

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once and aborts the program with the
// source location and the runtime's error string if it did not succeed.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

/*
 * Warp-level sum reduction using register shuffles (no shared memory).
 *
 * Each warp sums the 32 consecutive elements of `input` that its lanes map to
 * and lane 0 writes that partial sum to `output`. Lanes whose global index is
 * at or beyond `n` contribute 0.0f.
 *
 * Launch requirements:
 *   - 1D grid and 1D block.
 *   - blockDim.x must be a multiple of 32: the shuffle uses the full-warp mask
 *     0xffffffff, so every lane of every warp has to exist and take part.
 *   - `output` must hold gridDim.x * (blockDim.x / 32) floats.
 *
 * Parameters:
 *   input  - device pointer to at least n floats
 *   output - device pointer receiving one partial sum per warp
 *   n      - number of valid elements in input
 */
__global__ void warpReduceKernel(float *input, float *output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int lane = threadIdx.x % 32;
    int warpId = threadIdx.x / 32;

    // Out-of-range lanes still execute the shuffles below; they just add zero.
    float val = (idx < n) ? input[idx] : 0.0f;

    // Halve the exchange distance each step: after offsets 16, 8, 4, 2, 1
    // lane 0 holds the sum of all 32 lanes.
    for (int offset = 16; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }

    // Only lane 0 holds the complete warp sum.
    if (lane == 0) {
        output[blockIdx.x * (blockDim.x / 32) + warpId] = val;
    }
}

// Fills an input of ones, runs and times the warp-shuffle reduction, then adds
// the per-warp partial sums on the host so the result can be checked.
int main() {
    printf("=== Warp Shuffle ===\n\n");

    const int n = 1 << 20;
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;
    // The kernel writes one float per warp, so size the output from the launch
    // configuration rather than from n.
    const int numWarps = blocks * (threads / 32);

    float *h_input = (float *)malloc(n * sizeof(float));
    float *h_output = (float *)malloc(numWarps * sizeof(float));
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_input);
        free(h_output);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_input[i] = 1.0f;

    float *d_input, *d_output;
    CUDA_CHECK(cudaMalloc(&d_input, n * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_output, numWarps * sizeof(float)));
    // Without this copy the kernel would read uninitialised device memory.
    CUDA_CHECK(cudaMemcpy(d_input, h_input, n * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    warpReduceKernel<<<blocks, threads>>>(d_input, d_output, n);
    // A kernel launch returns nothing; a bad configuration only shows up here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    printf("Time: %.2f ms\n", ms);

    CUDA_CHECK(cudaMemcpy(h_output, d_output, numWarps * sizeof(float), cudaMemcpyDeviceToHost));
    double sum = 0.0;
    for (int i = 0; i < numWarps; i++) sum += h_output[i];
    printf("Sum: %.0f (expected: %d)\n", sum, n);

    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    free(h_input);
    free(h_output);
    return 0;
}

## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

/*
 * Block-level sum reduction in shared memory.
 *
 * Each thread loads up to two elements of `input` (one at its own position
 * within the block's 2 * blockDim.x wide slice, one blockDim.x further on),
 * adds them, and the block then reduces those per-thread sums with
 * sequential addressing. Thread 0 writes the block's total to
 * output[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid and 1D block; any blockDim.x up to 1024 is accepted
 *     (it does not have to be a power of two).
 *   - gridDim.x * blockDim.x * 2 must cover n for every element to be summed.
 *   - `output` must hold gridDim.x floats.
 *
 * Parameters:
 *   input  - device pointer to at least n floats
 *   output - device pointer receiving one partial sum per block
 *   n      - number of valid elements in input; values <= 0 yield sums of 0
 */
__global__ void reductionOptimized(float *input, float *output, int n) {
    // Sized for 1024 threads, the documented per-block thread limit, so a
    // launch with more than 256 threads cannot write past the array.
    __shared__ float sdata[1024];

    const unsigned int tid = threadIdx.x;
    const unsigned int first = blockIdx.x * (blockDim.x * 2u) + tid;
    const unsigned int second = first + blockDim.x;
    // Clamp before the unsigned comparisons below: a negative n would
    // otherwise convert to a huge value and let out-of-range reads through.
    const unsigned int count = (n > 0) ? (unsigned int)n : 0u;

    // Two loads per thread, added in a register before touching shared memory.
    float acc = 0.0f;
    if (first < count) acc = input[first];
    if (second < count) acc += input[second];
    sdata[tid] = acc;
    __syncthreads();

    // Sequential addressing: the lower part of the active range accumulates
    // the upper part. Rounding the split point up keeps the middle element
    // when the active count is odd, so non-power-of-two blocks lose nothing.
    // For power-of-two block sizes this pairs elements exactly like a plain
    // halving loop.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            sdata[tid] += sdata[tid + half];
        }
        // Reached by every thread in the block; never placed inside the branch.
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}

// Aborts the program with a diagnostic when a CUDA runtime call reports an
// error; `what` names the step that was being attempted.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Sums one million ones on the GPU with reductionOptimized, times the kernel
// with CUDA events, finishes the reduction on the host and reports whether
// the total matches the element count.
int main() {
    int n = 1000000;
    size_t size = n * sizeof(float);

    printf("=== Optimized Parallel Reduction ===\n\n");

    int threadsPerBlock = 256;
    // Each block consumes 2 * threadsPerBlock elements, hence the doubled divisor.
    int blocksPerGrid = (n + threadsPerBlock * 2 - 1) / (threadsPerBlock * 2);
    size_t partialBytes = blocksPerGrid * sizeof(float);

    float *h_input = (float*)malloc(size);
    float *h_output = (float*)malloc(partialBytes);
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_input);
        free(h_output);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_input[i] = 1.0f;

    float *d_input, *d_output;
    checkCuda(cudaMalloc(&d_input, size), "cudaMalloc(d_input)");
    checkCuda(cudaMalloc(&d_output, partialBytes), "cudaMalloc(d_output)");
    checkCuda(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice),
              "copy input to device");

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    reductionOptimized<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    // A launch returns nothing; configuration errors only surface here.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    // Also surfaces faults raised while the kernel was executing.
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float milliseconds = 0;
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    checkCuda(cudaMemcpy(h_output, d_output, partialBytes, cudaMemcpyDeviceToHost),
              "copy partial sums to host");

    // Accumulate in double so the host-side total does not lose precision
    // before it is compared with the integer element count.
    double sum = 0.0;
    for (int i = 0; i < blocksPerGrid; i++) {
        sum += h_output[i];
    }

    printf("Sum: %.0f (expected: %d)\n", sum, n);
    printf("Time: %.3f ms\n", milliseconds);
    printf("Result: %s\n\n", (sum == (double)n) ? "CORRECT" : "INCORRECT");
    printf("OPTIMIZATION: Sequential addressing avoids warp divergence!\n");

    free(h_input); free(h_output);
    checkCuda(cudaFree(d_input), "cudaFree(d_input)");
    checkCuda(cudaFree(d_output), "cudaFree(d_output)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    return 0;
}

## Key Takeaways

1. Warp shuffle allows intra-warp communication
2. No shared memory required
3. Lower latency than shared memory
4. Operations: `__shfl_sync`, `__shfl_down_sync`, `__shfl_up_sync`

## Next Steps

Continue to: **14_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

------