# Notebook 07: Memory Bandwidth Benchmarking

## Phase 2: Memory Management

**Learning Objectives:**

- Measure host-to-device transfer bandwidth
- Compare pinned vs pageable memory performance
- Understand PCIe bandwidth limitations
- Profile memory access patterns
- Optimize data transfer strategies

## Concept: Memory Bandwidth

**Theoretical Bandwidth:**

- PCIe 3.0 x16: 15.75 GB/s per direction
- PCIe 4.0 x16: 31.5 GB/s per direction

**Factors Affecting Bandwidth:**

- Transfer size (larger is better)
- Memory type (pinned vs pageable)
- Transfer direction (H2D, D2H, D2D)
- Concurrent operations

**Bandwidth Calculation:**

```
Bandwidth = DataSize / Time
```

## Example 1: Basic Memory Bandwidth Benchmarking

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Evaluate a CUDA runtime call once and abort the program if it did not
// return cudaSuccess. The failing source location is printed together with
// the runtime's error string. Output goes to stdout via printf, as before,
// so the message appears alongside the program's normal output.
//
// The argument is parenthesized so any expression can be passed safely, and
// the temporary uses a trailing underscore so it cannot shadow a variable
// named `err` at the call site.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

/**
 * Measure host-to-device copy bandwidth for one buffer size.
 *
 * Allocates a host buffer (pinned via cudaMallocHost, or pageable via malloc)
 * and a device buffer of `size` bytes, performs one untimed warm-up copy, then
 * times several back-to-back cudaMemcpy host-to-device transfers with CUDA
 * events and reports the average rate.
 *
 * @param size    Number of bytes to transfer per copy.
 * @param pinned  true to use page-locked host memory, false for malloc'd memory.
 * @return        Average bandwidth in GB/s (1 GB = 1e9 bytes). Returns 0.0f when
 *                `size` is 0 or the measured elapsed time is not positive, so
 *                the caller never receives inf/NaN.
 *
 * Any CUDA API failure or host allocation failure prints a message and
 * terminates the process.
 */
float measureBandwidth(size_t size, bool pinned) {
    // Number of timed copies; averaging smooths out run-to-run jitter.
    const int kTimedCopies = 5;

    // Nothing to transfer: avoid zero-byte allocations and a 0/0 division.
    if (size == 0) {
        return 0.0f;
    }

    float *h_data = NULL;
    float *d_data = NULL;

    // Allocate memory
    if (pinned) {
        CUDA_CHECK(cudaMallocHost(&h_data, size));
    } else {
        h_data = (float*)malloc(size);
        if (h_data == NULL) {
            // Without this check the initialization loop below would
            // dereference a null pointer.
            printf("Host allocation of %zu bytes failed\n", size);
            exit(EXIT_FAILURE);
        }
    }
    CUDA_CHECK(cudaMalloc(&d_data, size));

    // Initialize data (also touches every page of the pageable buffer)
    const size_t count = size / sizeof(float);
    for (size_t i = 0; i < count; i++) {
        h_data[i] = (float)i;
    }

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Untimed warm-up copy so one-time setup cost is not attributed to the
    // measured transfers.
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    // Measure transfer time
    CUDA_CHECK(cudaEventRecord(start));
    for (int rep = 0; rep < kTimedCopies; rep++) {
        CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));
    }
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    // Calculate bandwidth: total bytes moved / total seconds, in GB/s.
    float bandwidth = 0.0f;
    if (milliseconds > 0.0f) {
        const double gigabytes = ((double)size * kTimedCopies) / 1e9;
        const double seconds = milliseconds / 1000.0;
        bandwidth = (float)(gigabytes / seconds);
    }

    // Cleanup
    if (pinned) {
        CUDA_CHECK(cudaFreeHost(h_data));
    } else {
        free(h_data);
    }
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return bandwidth;
}

// Entry point: for each transfer size, measure host-to-device bandwidth with
// pageable and then pinned host memory, and print one table row per size.
int main() {
    // Transfer sizes in bytes: 1 MB, 4 MB, 16 MB, 64 MB.
    const size_t transferSizes[] = {1<<20, 1<<22, 1<<24, 1<<26};
    const int numSizes = (int)(sizeof(transferSizes) / sizeof(transferSizes[0]));
    const size_t bytesPerMB = 1024*1024;

    printf("=== Memory Bandwidth Measurement ===\n\n");
    printf("%-15s %-20s %-20s\n", "Size", "Pageable (GB/s)", "Pinned (GB/s)");
    printf("-----------------------------------------------------\n");

    for (int idx = 0; idx != numSizes; ++idx) {
        const size_t bytes = transferSizes[idx];

        // Pageable is measured before pinned for every size.
        const float pageableGBs = measureBandwidth(bytes, false);
        const float pinnedGBs = measureBandwidth(bytes, true);

        // The first column shows the size in MB.
        printf("%-15zu %-20.2f %-20.2f\n",
               bytes / bytesPerMB, pageableGBs, pinnedGBs);
    }

    printf("\nPinned memory provides significantly better bandwidth!\n");

    return 0;
}

## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>

// Placeholder kernel for the exercise: every launched thread prints the same
// line through device-side printf. It takes no arguments and touches no memory.
__global__ void kernel()
{
    printf("Example for: measure_transfer_size_impact\n");
}

// Entry point for the exercise cell: launches `kernel` with one block of 32
// threads, waits for it to finish, and reports success only if both the
// launch and the execution completed without a CUDA error.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    printf("=== Measure Transfer Size Impact ===\n\n");

    kernel<<<1, 32>>>();

    // A kernel launch returns no status itself: launch-configuration errors
    // are reported by cudaGetLastError(), and faults during execution are
    // reported by the next synchronizing call.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    printf("\nExample completed successfully!\n");
    return 0;
}

## Key Takeaways

1. PCIe bandwidth limits and theoretical maximums
2. Effective bandwidth vs peak bandwidth
3. Measuring memory transfer performance
4. Factors affecting bandwidth (transfer size, pinned memory)

## Next Steps

Continue to: **08_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

---