# Notebook 08: Unified Memory and Managed Memory

## Phase 2: Memory Management

**Learning Objectives:**

- Understand the Unified Memory concept
- Use `cudaMallocManaged` for simpler code
- Learn about page faults and migration
- Understand performance implications
- Use prefetching for optimization

## Concept: Unified Memory

**Unified Memory:**

- Single pointer accessible from both CPU and GPU
- Automatic data migration between host and device
- Simpler code, but good performance requires understanding when migrations happen

**Functions:**

```cuda
cudaMallocManaged(&ptr, size);            // Allocate managed memory
cudaMemPrefetchAsync(ptr, size, device);  // Prefetch data
cudaMemAdvise(ptr, size, advice, device); // Give hints
```

**Benefits:**

- Simplified memory management
- Automatic migration
- Oversubscription support

**Considerations:**

- Page fault overhead
- Migration costs
- On-demand page migration and oversubscription require compute capability 6.0+

## Example 1: Basic Unified Memory and Managed Memory

In [None]:
%%cu
#include <stdio.h>

// CUDA_CHECK(call): evaluate a CUDA runtime call exactly once. If the result
// is not cudaSuccess, print the source location and the error string to
// stderr and terminate the process with EXIT_FAILURE.
// The argument is parenthesized, and the temporary is named `err_` so that a
// caller writing `CUDA_CHECK(err)` with its own `err` variable does not expand
// into a self-initialization of the temporary.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Element-wise addition kernel. Each thread derives one global index from its
// block and thread coordinates; if that index is below n it stores
// a[index] + b[index] into c[index]. Threads whose index is n or greater
// return without touching memory.
__global__ void vectorAdd(float *a, float *b, float *c, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    c[gid] = a[gid] + b[gid];
}

// Basic Unified Memory demo: allocates three managed float buffers, fills the
// two inputs on the CPU, adds them on the GPU with vectorAdd, then verifies
// the result on the CPU through the same pointers (no cudaMemcpy calls).
// Returns 0; any failing CUDA call terminates the process via CUDA_CHECK.
int main() {
    int n = 1000000;
    size_t size = n * sizeof(float);

    printf("=== Unified Memory Basic Example ===\n\n");

    // Allocate unified memory (accessible from both CPU and GPU)
    float *a, *b, *c;
    CUDA_CHECK(cudaMallocManaged(&a, size));
    CUDA_CHECK(cudaMallocManaged(&b, size));
    CUDA_CHECK(cudaMallocManaged(&c, size));

    printf("✓ Allocated unified memory: %zu MB\n", size * 3 / (1024*1024));

    // Initialize on CPU (no explicit transfer needed!)
    for (int i = 0; i < n; i++) {
        a[i] = i * 1.0f;
        b[i] = i * 2.0f;
    }
    printf("✓ Initialized data on CPU\n");

    // Launch kernel (data automatically migrated to GPU).
    // Ceil-division so the last, partially filled block still covers the tail.
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a, b, c, n);
    // A kernel launch returns no status itself: query launch-configuration
    // errors explicitly, then synchronize to surface execution errors and to
    // make the results safe to read from the CPU.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    printf("✓ Kernel executed on GPU\n");

    // Access result on CPU (automatically migrated back)
    bool correct = true;
    for (int i = 0; i < n; i++) {
        if (c[i] != a[i] + b[i]) {
            correct = false;
            break;
        }
    }
    printf("✓ Verified results on CPU\n");
    printf("\nResult: %s\n", correct ? "CORRECT" : "INCORRECT");
    printf("Sample: %.1f + %.1f = %.1f\n", a[0], b[0], c[0]);

    // Managed buffers are released with cudaFree, one call per allocation;
    // there is no separate host-side free.
    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));

    printf("\nAdvantages: No explicit cudaMemcpy calls!\n");
    printf("Disadvantage: Implicit transfers may be slower\n");

    return 0;
}

## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// CUDA_CHECK(call): evaluate a CUDA runtime call exactly once. If the result
// is not cudaSuccess, print the source location and the error string to
// stderr and terminate the process with EXIT_FAILURE.
// The argument is parenthesized, and the temporary is named `err_` so that a
// caller writing `CUDA_CHECK(err)` with its own `err` variable does not expand
// into a self-initialization of the temporary.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Adds two float arrays element by element. A thread's global index is its
// block's starting offset plus its position inside the block; only indices
// below n read a and b and write c.
__global__ void vectorAdd(float *a, float *b, float *c, int n) {
    const unsigned int blockStart = blockIdx.x * blockDim.x;
    const int i = blockStart + threadIdx.x;
    if (i < n) {
        const float sum = a[i] + b[i];
        c[i] = sum;
    }
}

// Exercise driver: allocates three managed float buffers, initializes the
// inputs on the CPU, runs vectorAdd on the GPU, and compares every output
// element against a[i] + b[i] on the CPU, printing PASSED or FAILED.
// Returns 0; any failing CUDA call terminates the process via CUDA_CHECK.
int main() {
    printf("=== Unified Memory ===\n\n");

    int n = 1000000;
    size_t size = n * sizeof(float);

    // Allocate unified memory
    float *a, *b, *c;
    CUDA_CHECK(cudaMallocManaged(&a, size));
    CUDA_CHECK(cudaMallocManaged(&b, size));
    CUDA_CHECK(cudaMallocManaged(&c, size));

    // Initialize on CPU
    for (int i = 0; i < n; i++) {
        a[i] = i * 1.0f;
        b[i] = i * 2.0f;
    }

    // Launch kernel; ceil-division so the final partial block covers the tail
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    vectorAdd<<<blocks, threads>>>(a, b, c, n);
    // The launch itself returns no status: check for launch-configuration
    // errors, then synchronize so execution errors surface and the CPU can
    // safely read the results.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Verify on CPU
    bool correct = true;
    for (int i = 0; i < n; i++) {
        if (c[i] != a[i] + b[i]) {
            correct = false;
            break;
        }
    }

    printf("Result: %s\n", correct ? "PASSED" : "FAILED");
    printf("Unified memory simplifies programming!\n");

    // Release the managed buffers, checking each call instead of ignoring it
    CUDA_CHECK(cudaFree(a));
    CUDA_CHECK(cudaFree(b));
    CUDA_CHECK(cudaFree(c));
    return 0;
}

## Key Takeaways

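1. Unified Memory simplifies memory management: one pointer is valid on both CPU and GPU
2. `cudaMallocManaged` allocates memory that is migrated automatically
3. Accessing non-resident data triggers page faults and data migration, which cost time
4. Prefetching with `cudaMemPrefetchAsync` moves data ahead of use to avoid those faults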

## Next Steps

Continue to: **09_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

------