# Notebook 09: Shared Memory Basics

## Phase 2: Memory Management - Shared Memory

**Learning Objectives:**

- Understand shared memory architecture
- Declare and use shared memory
- Implement basic tiling with shared memory
- Learn synchronization with `__syncthreads()`
- Understand shared memory scope

## Concept: Shared Memory

**What is Shared Memory?**

- Fast on-chip memory (similar to L1 cache)
- Shared among threads in a block
- ~100x faster than global memory
- Limited size (typically 48-96 KB per SM, depending on the GPU architecture)

**Declaration:**

```cuda
__shared__ float sharedData[256];        // Static allocation
extern __shared__ float sharedData[];    // Dynamic allocation
```

**Key Points:**

- Requires `__syncthreads()` for synchronization
- Per-block scope
- Bank conflicts can reduce performance
- Ideal for data reuse within a block

## Example 1: Shared Memory Basics

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

#define TILE_SIZE 256

/*
 * Block-wise reversal staged through shared memory.
 *
 * Each block copies its slice of `input` into the shared `tile`, waits at a
 * barrier, and then every thread writes the element held in its mirror slot
 * (blockDim.x - 1 - threadIdx.x) to `output`.
 *
 * Launch layout: 1D grid of 1D blocks. The tile is statically sized, so the
 * launch must satisfy blockDim.x <= TILE_SIZE.
 *
 * Tail handling: when n is not a multiple of blockDim.x, the last block only
 * fills the first (n - blockStart) tile slots. A thread whose mirror slot was
 * never filled skips its write instead of reading uninitialized shared
 * memory. This matches arrayReverseGlobal, which skips the same elements.
 *
 * input  - device array of n floats to read
 * output - device array of n floats to write
 * n      - number of elements
 */
__global__ void arrayReverseShared(float *input, float *output, int n) {
    __shared__ float tile[TILE_SIZE];

    int blockStart = blockIdx.x * blockDim.x;
    int idx = blockStart + threadIdx.x;

    // Load data into shared memory (only slots backed by real input)
    if (idx < n) {
        tile[threadIdx.x] = input[idx];
    }
    // Barrier sits outside the branch so every thread in the block reaches it
    __syncthreads();

    // Write in reverse order
    if (idx < n) {
        int mirrorSlot = blockDim.x - 1 - threadIdx.x;
        // The mirror slot was loaded only if its global index is in range
        if (blockStart + mirrorSlot < n) {
            output[idx] = tile[mirrorSlot];
        }
    }
}

/*
 * Block-wise reversal that reads straight from global memory.
 *
 * Each thread copies the element at its mirror position within the block's
 * slice of `input` into its own position in `output`. Threads past the end
 * of the array, or whose mirror position lies past the end, write nothing.
 *
 * Launch layout: 1D grid of 1D blocks.
 */
__global__ void arrayReverseGlobal(float *input, float *output, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;  // beyond the end of the array
    }

    // Last index of this block's slice, minus this thread's offset into it
    const int mirror = (blockIdx.x + 1) * blockDim.x - 1 - threadIdx.x;
    if (mirror >= n) {
        return;  // mirror element does not exist in the tail block
    }

    output[gid] = input[mirror];
}

// Abort with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Times arrayReverseShared against arrayReverseGlobal on a 1,000,000-element
 * array, checks the result copied back from the device, and prints both
 * timings with their ratio.
 *
 * Returns 0 on success, 1 if host allocation fails or the output does not
 * match the expected block-wise reversal. CUDA failures terminate the
 * process through checkCuda.
 */
int main() {
    int n = 1000000;
    size_t size = n * sizeof(float);

    printf("=== Shared Memory Demonstration ===\n\n");

    float *h_input = (float*)malloc(size);
    float *h_output = (float*)malloc(size);
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_input); free(h_output);
        return 1;
    }

    for (int i = 0; i < n; i++) h_input[i] = (float)i;

    float *d_input = NULL, *d_output = NULL;
    checkCuda(cudaMalloc(&d_input, size), "cudaMalloc(d_input)");
    checkCuda(cudaMalloc(&d_output, size), "cudaMalloc(d_output)");
    checkCuda(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice), "copy input to device");
    // Start from a zeroed output buffer so elements a kernel skips are not
    // left as uninitialized device memory.
    checkCuda(cudaMemset(d_output, 0, size), "cudaMemset(d_output)");

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    int threadsPerBlock = TILE_SIZE;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    // Untimed warm-up: the first launch carries one-time initialization cost
    // that would otherwise be billed to whichever kernel is timed first.
    arrayReverseShared<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    arrayReverseGlobal<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    checkCuda(cudaGetLastError(), "warm-up launch");
    checkCuda(cudaDeviceSynchronize(), "warm-up execution");

    // Test with shared memory
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    arrayReverseShared<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    checkCuda(cudaGetLastError(), "arrayReverseShared launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "arrayReverseShared execution");

    float time_shared;
    checkCuda(cudaEventElapsedTime(&time_shared, start, stop), "cudaEventElapsedTime(shared)");

    // Test without shared memory
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    arrayReverseGlobal<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    checkCuda(cudaGetLastError(), "arrayReverseGlobal launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "arrayReverseGlobal execution");

    float time_global;
    checkCuda(cudaEventElapsedTime(&time_global, start, stop), "cudaEventElapsedTime(global)");

    checkCuda(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost), "copy output to host");

    // Verify every element whose mirror position exists; arrayReverseGlobal
    // ran last and writes exactly those elements.
    int mismatches = 0;
    for (int i = 0; i < n; i++) {
        int mirror = (i / threadsPerBlock + 1) * threadsPerBlock - 1 - (i % threadsPerBlock);
        if (mirror < n && h_output[i] != h_input[mirror]) mismatches++;
    }

    printf("Shared memory: %.3f ms\n", time_shared);
    printf("Global memory: %.3f ms\n", time_global);
    printf("Speedup: %.2fx\n\n", time_global / time_shared);

    // Only claim a win when the measurement shows one.
    if (time_shared < time_global) {
        printf("Shared memory is faster due to on-chip access!\n");
    } else {
        printf("Shared memory was not faster here: each element is read only once, so there is no reuse to exploit.\n");
    }
    printf("Verification: %s\n", mismatches == 0 ? "PASSED" : "FAILED");

    free(h_input); free(h_output);
    checkCuda(cudaFree(d_input), "cudaFree(d_input)");
    checkCuda(cudaFree(d_output), "cudaFree(d_output)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    return mismatches == 0 ? 0 : 1;
}

## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

#define TILE_SIZE 256

/*
 * Block-wise reversal staged through shared memory.
 *
 * Each block copies its slice of `input` into the shared `tile`, waits at a
 * barrier, and then every thread writes the element held in its mirror slot
 * (blockDim.x - 1 - threadIdx.x) to `output`.
 *
 * Launch layout: 1D grid of 1D blocks. The tile is statically sized, so the
 * launch must satisfy blockDim.x <= TILE_SIZE.
 *
 * Tail handling: when n is not a multiple of blockDim.x, the last block only
 * fills the first (n - blockStart) tile slots. A thread whose mirror slot was
 * never filled skips its write instead of reading uninitialized shared
 * memory. This matches arrayReverseGlobal, which skips the same elements.
 *
 * input  - device array of n floats to read
 * output - device array of n floats to write
 * n      - number of elements
 */
__global__ void arrayReverseShared(float *input, float *output, int n) {
    __shared__ float tile[TILE_SIZE];

    int blockStart = blockIdx.x * blockDim.x;
    int idx = blockStart + threadIdx.x;

    // Load data into shared memory (only slots backed by real input)
    if (idx < n) {
        tile[threadIdx.x] = input[idx];
    }
    // Barrier sits outside the branch so every thread in the block reaches it
    __syncthreads();

    // Write in reverse order
    if (idx < n) {
        int mirrorSlot = blockDim.x - 1 - threadIdx.x;
        // The mirror slot was loaded only if its global index is in range
        if (blockStart + mirrorSlot < n) {
            output[idx] = tile[mirrorSlot];
        }
    }
}

/*
 * Reverses `input` into `output` one block-sized slice at a time, reading
 * directly from global memory.
 *
 * A thread writes only when both its own index and the index of its mirror
 * element within the block's slice are below n; otherwise it does nothing.
 *
 * Launch layout: 1D grid of 1D blocks.
 */
__global__ void arrayReverseGlobal(float *input, float *output, int n) {
    int sliceStart = blockIdx.x * blockDim.x;
    int self = sliceStart + threadIdx.x;
    // Mirror position: counted back from the last index of this block's slice
    int partner = sliceStart + blockDim.x - 1 - threadIdx.x;

    bool inRange = (self < n) && (partner < n);
    if (inRange) {
        output[self] = input[partner];
    }
}

// Abort with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Times arrayReverseShared against arrayReverseGlobal on a 1,000,000-element
 * array, checks the result copied back from the device, and prints both
 * timings with their ratio.
 *
 * Returns 0 on success, 1 if host allocation fails or the output does not
 * match the expected block-wise reversal. CUDA failures terminate the
 * process through checkCuda.
 */
int main() {
    int n = 1000000;
    size_t size = n * sizeof(float);

    printf("=== Shared Memory Demonstration ===\n\n");

    float *h_input = (float*)malloc(size);
    float *h_output = (float*)malloc(size);
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_input); free(h_output);
        return 1;
    }

    for (int i = 0; i < n; i++) h_input[i] = (float)i;

    float *d_input = NULL, *d_output = NULL;
    checkCuda(cudaMalloc(&d_input, size), "cudaMalloc(d_input)");
    checkCuda(cudaMalloc(&d_output, size), "cudaMalloc(d_output)");
    checkCuda(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice), "copy input to device");
    // Start from a zeroed output buffer so elements a kernel skips are not
    // left as uninitialized device memory.
    checkCuda(cudaMemset(d_output, 0, size), "cudaMemset(d_output)");

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    int threadsPerBlock = TILE_SIZE;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    // Untimed warm-up: the first launch carries one-time initialization cost
    // that would otherwise be billed to whichever kernel is timed first.
    arrayReverseShared<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    arrayReverseGlobal<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    checkCuda(cudaGetLastError(), "warm-up launch");
    checkCuda(cudaDeviceSynchronize(), "warm-up execution");

    // Test with shared memory
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    arrayReverseShared<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    checkCuda(cudaGetLastError(), "arrayReverseShared launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "arrayReverseShared execution");

    float time_shared;
    checkCuda(cudaEventElapsedTime(&time_shared, start, stop), "cudaEventElapsedTime(shared)");

    // Test without shared memory
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    arrayReverseGlobal<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    checkCuda(cudaGetLastError(), "arrayReverseGlobal launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "arrayReverseGlobal execution");

    float time_global;
    checkCuda(cudaEventElapsedTime(&time_global, start, stop), "cudaEventElapsedTime(global)");

    checkCuda(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost), "copy output to host");

    // Verify every element whose mirror position exists; arrayReverseGlobal
    // ran last and writes exactly those elements.
    int mismatches = 0;
    for (int i = 0; i < n; i++) {
        int mirror = (i / threadsPerBlock + 1) * threadsPerBlock - 1 - (i % threadsPerBlock);
        if (mirror < n && h_output[i] != h_input[mirror]) mismatches++;
    }

    printf("Shared memory: %.3f ms\n", time_shared);
    printf("Global memory: %.3f ms\n", time_global);
    printf("Speedup: %.2fx\n\n", time_global / time_shared);

    // Only claim a win when the measurement shows one.
    if (time_shared < time_global) {
        printf("Shared memory is faster due to on-chip access!\n");
    } else {
        printf("Shared memory was not faster here: each element is read only once, so there is no reuse to exploit.\n");
    }
    printf("Verification: %s\n", mismatches == 0 ? "PASSED" : "FAILED");

    free(h_input); free(h_output);
    checkCuda(cudaFree(d_input), "cudaFree(d_input)");
    checkCuda(cudaFree(d_output), "cudaFree(d_output)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    return mismatches == 0 ? 0 : 1;
}

## Key Takeaways

1. Shared memory is on-chip, fast memory
2. Shared among threads in a block
3. Declared with __shared__ keyword
4. Requires __syncthreads() for synchronization

## Next Steps

Continue to: **10_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

------