# Notebook 11: Memory Coalescing Demonstration

## Phase 2: Memory Management - Shared Memory

**Learning Objectives:**
- Understand the concept of memory coalescing
- Identify coalesced vs. uncoalesced access
- Measure the performance impact
- Learn how to optimize access patterns
- Fix common coalescing issues

## Concept: Memory Coalescing

**Coalesced Access:**
- Consecutive threads access consecutive memory addresses
- A single memory transaction serves the whole warp
- Maximum bandwidth utilization

**Uncoalesced Access:**
- Random or strided access patterns
- Multiple memory transactions per warp
- Reduced effective bandwidth

**Rules for Coalescing:**
- Threads in a warp access consecutive addresses
- Accesses are aligned to the segment size (32, 64, or 128 bytes)
- Accesses fall within the same cache line

**Performance Impact (illustrative; actual figures depend on the GPU):**
- Coalesced: ~300 GB/s
- Uncoalesced: ~30 GB/s (about 10x slower)

## Example 1: Basic Memory Coalescing Demonstration

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

/*
 * Coalesced access pattern: every thread handles the element whose index
 * equals its global thread id, so neighbouring threads touch neighbouring
 * floats.
 *
 *   data   - input array, read at indices [0, n)
 *   result - output array, written at indices [0, n); result[i] = 2 * data[i]
 *   n      - number of elements to process
 *
 * Indexing uses only the x dimension of the launch. Threads whose global id
 * is >= n do nothing, so the grid may be rounded up past n.
 */
__global__ void coalescedAccess(float *data, float *result, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;  // tail threads of the last block have no element
    }
    result[gid] = 2.0f * data[gid];
}

/*
 * Strided access pattern: thread t handles element t * stride, so neighbouring
 * threads touch addresses that are `stride` floats apart. With a large stride
 * this is non-coalesced and performs poorly.
 *
 *   data   - input array, read at indices t * stride that fall in [0, n)
 *   result - output array, written at the same indices; result[i] = 2 * data[i]
 *   n      - length of both arrays, in elements
 *   stride - distance in elements between the indices of consecutive threads
 *
 * Indexing uses only the x dimension of the launch. Threads whose index falls
 * outside [0, n) do nothing.
 */
__global__ void stridedAccess(float *data, float *result, int n, int stride) {
    // Widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
    // arithmetic and would wrap silently on very large grids.
    const long long tid = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // For any stride >= 1, tid * stride < n implies tid < n. Rejecting
    // tid >= n here also keeps the 64-bit product below from overflowing.
    if (tid >= n) {
        return;
    }

    // The 32-bit product tid * stride could wrap to a value that passes the
    // idx < n test and aliases an unrelated element; 64-bit math prevents it.
    const long long idx = tid * stride;

    // The lower bound rejects negative strides, which would otherwise yield a
    // negative index that still satisfies idx < n and access out of bounds.
    if (idx >= 0 && idx < n) {
        result[idx] = data[idx] * 2.0f;
    }
}

// Evaluate a CUDA runtime call and abort with file/line context if it fails.
// An unchecked failure would otherwise surface later as meaningless timings.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Times coalescedAccess against stridedAccess (stride = 32) with CUDA events
 * and prints the elapsed times, the relative slowdown, and the effective
 * bandwidth of each kernel.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main() {
    const int n = 10000000;
    const int stride = 32;
    const size_t size = (size_t)n * sizeof(float);

    printf("=== Memory Coalescing Demonstration ===\n\n");

    float *d_data, *d_result;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMalloc(&d_result, size));
    // Give the input defined contents so the kernels never read
    // uninitialized device memory.
    CUDA_CHECK(cudaMemset(d_data, 0, size));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    const int threadsPerBlock = 256;
    const int coalescedBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    // The strided kernel touches indices 0, stride, 2*stride, ... below n,
    // i.e. ceil(n / stride) elements; one thread is launched per element.
    const int stridedElems = (n + stride - 1) / stride;
    const int stridedBlocks = (stridedElems + threadsPerBlock - 1) / threadsPerBlock;

    // Warm-up: the first launch of a kernel pays one-time initialization
    // costs that must not be attributed to either access pattern.
    coalescedAccess<<<coalescedBlocks, threadsPerBlock>>>(d_data, d_result, n);
    CUDA_CHECK(cudaGetLastError());
    stridedAccess<<<stridedBlocks, threadsPerBlock>>>(d_data, d_result, n, stride);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Test coalesced access
    CUDA_CHECK(cudaEventRecord(start));
    coalescedAccess<<<coalescedBlocks, threadsPerBlock>>>(d_data, d_result, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time_coalesced;
    CUDA_CHECK(cudaEventElapsedTime(&time_coalesced, start, stop));

    // Test strided access (stride = 32)
    CUDA_CHECK(cudaEventRecord(start));
    stridedAccess<<<stridedBlocks, threadsPerBlock>>>(d_data, d_result, n, stride);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time_strided;
    CUDA_CHECK(cudaEventElapsedTime(&time_strided, start, stop));

    // Effective bandwidth counts only the bytes each kernel was asked to
    // move: one float read plus one float written per processed element.
    // The strided kernel processes stridedElems elements, not n.
    const double bytes_coalesced = 2.0 * (double)size;
    const double bytes_strided = 2.0 * (double)stridedElems * sizeof(float);
    const double bandwidth_coalesced = (bytes_coalesced / 1e9) / (time_coalesced / 1000.0);
    const double bandwidth_strided = (bytes_strided / 1e9) / (time_strided / 1000.0);

    printf("Coalesced access: %.3f ms\n", time_coalesced);
    printf("Strided access (stride=32): %.3f ms\n", time_strided);
    // The two kernels process different element counts, so the slowdown is
    // the ratio of effective bandwidths rather than of raw elapsed times.
    printf("Performance degradation: %.2fx slower\n\n", bandwidth_coalesced / bandwidth_strided);

    printf("Coalesced bandwidth: %.2f GB/s\n", bandwidth_coalesced);
    printf("Strided bandwidth: %.2f GB/s\n\n", bandwidth_strided);

    printf("KEY INSIGHT: Adjacent threads should access adjacent memory!\n");

    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_result));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

/*
 * Coalesced access pattern: every thread handles the element whose index
 * equals its global thread id, so neighbouring threads touch neighbouring
 * floats.
 *
 *   data   - input array, read at indices [0, n)
 *   result - output array, written at indices [0, n); result[i] = 2 * data[i]
 *   n      - number of elements to process
 *
 * Indexing uses only the x dimension of the launch. Threads whose global id
 * is >= n do nothing, so the grid may be rounded up past n.
 */
__global__ void coalescedAccess(float *data, float *result, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;  // tail threads of the last block have no element
    }
    result[gid] = 2.0f * data[gid];
}

/*
 * Strided access pattern: thread t handles element t * stride, so neighbouring
 * threads touch addresses that are `stride` floats apart. With a large stride
 * this is non-coalesced and performs poorly.
 *
 *   data   - input array, read at indices t * stride that fall in [0, n)
 *   result - output array, written at the same indices; result[i] = 2 * data[i]
 *   n      - length of both arrays, in elements
 *   stride - distance in elements between the indices of consecutive threads
 *
 * Indexing uses only the x dimension of the launch. Threads whose index falls
 * outside [0, n) do nothing.
 */
__global__ void stridedAccess(float *data, float *result, int n, int stride) {
    // Widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
    // arithmetic and would wrap silently on very large grids.
    const long long tid = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // For any stride >= 1, tid * stride < n implies tid < n. Rejecting
    // tid >= n here also keeps the 64-bit product below from overflowing.
    if (tid >= n) {
        return;
    }

    // The 32-bit product tid * stride could wrap to a value that passes the
    // idx < n test and aliases an unrelated element; 64-bit math prevents it.
    const long long idx = tid * stride;

    // The lower bound rejects negative strides, which would otherwise yield a
    // negative index that still satisfies idx < n and access out of bounds.
    if (idx >= 0 && idx < n) {
        result[idx] = data[idx] * 2.0f;
    }
}

// Evaluate a CUDA runtime call and abort with file/line context if it fails.
// An unchecked failure would otherwise surface later as meaningless timings.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Times coalescedAccess against stridedAccess (stride = 32) with CUDA events
 * and prints the elapsed times, the relative slowdown, and the effective
 * bandwidth of each kernel.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main() {
    const int n = 10000000;
    const int stride = 32;
    const size_t size = (size_t)n * sizeof(float);

    printf("=== Memory Coalescing Demonstration ===\n\n");

    float *d_data, *d_result;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMalloc(&d_result, size));
    // Give the input defined contents so the kernels never read
    // uninitialized device memory.
    CUDA_CHECK(cudaMemset(d_data, 0, size));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    const int threadsPerBlock = 256;
    const int coalescedBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    // The strided kernel touches indices 0, stride, 2*stride, ... below n,
    // i.e. ceil(n / stride) elements; one thread is launched per element.
    const int stridedElems = (n + stride - 1) / stride;
    const int stridedBlocks = (stridedElems + threadsPerBlock - 1) / threadsPerBlock;

    // Warm-up: the first launch of a kernel pays one-time initialization
    // costs that must not be attributed to either access pattern.
    coalescedAccess<<<coalescedBlocks, threadsPerBlock>>>(d_data, d_result, n);
    CUDA_CHECK(cudaGetLastError());
    stridedAccess<<<stridedBlocks, threadsPerBlock>>>(d_data, d_result, n, stride);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Test coalesced access
    CUDA_CHECK(cudaEventRecord(start));
    coalescedAccess<<<coalescedBlocks, threadsPerBlock>>>(d_data, d_result, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time_coalesced;
    CUDA_CHECK(cudaEventElapsedTime(&time_coalesced, start, stop));

    // Test strided access (stride = 32)
    CUDA_CHECK(cudaEventRecord(start));
    stridedAccess<<<stridedBlocks, threadsPerBlock>>>(d_data, d_result, n, stride);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time_strided;
    CUDA_CHECK(cudaEventElapsedTime(&time_strided, start, stop));

    // Effective bandwidth counts only the bytes each kernel was asked to
    // move: one float read plus one float written per processed element.
    // The strided kernel processes stridedElems elements, not n.
    const double bytes_coalesced = 2.0 * (double)size;
    const double bytes_strided = 2.0 * (double)stridedElems * sizeof(float);
    const double bandwidth_coalesced = (bytes_coalesced / 1e9) / (time_coalesced / 1000.0);
    const double bandwidth_strided = (bytes_strided / 1e9) / (time_strided / 1000.0);

    printf("Coalesced access: %.3f ms\n", time_coalesced);
    printf("Strided access (stride=32): %.3f ms\n", time_strided);
    // The two kernels process different element counts, so the slowdown is
    // the ratio of effective bandwidths rather than of raw elapsed times.
    printf("Performance degradation: %.2fx slower\n\n", bandwidth_coalesced / bandwidth_strided);

    printf("Coalesced bandwidth: %.2f GB/s\n", bandwidth_coalesced);
    printf("Strided bandwidth: %.2f GB/s\n\n", bandwidth_strided);

    printf("KEY INSIGHT: Adjacent threads should access adjacent memory!\n");

    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_result));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

## Key Takeaways

1. Coalesced access = adjacent threads access adjacent memory
2. Stride-1 access pattern is optimal
3. Non-coalesced access can reduce effective bandwidth by an order of magnitude or more (up to 32x in the worst case)
4. Use profiler to identify coalescing issues

## Next Steps

Continue to: **12_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

------