# Notebook 18: Texture Memory and Caching

## Phase 4: Advanced Memory & Synchronization

**Learning Objectives:**

- Understand texture memory
- Learn texture objects
- Master image filtering
- Apply concepts in practical scenarios
- Measure and analyze performance

## Concept: Texture Memory and Caching

**Topics Covered:**

- Texture memory
- Texture objects
- Image filtering

**Key Concepts:**

This notebook covers texture memory in the context of Phase 4: Advanced Memory & Synchronization.

## Example 1: Basic Texture Memory and Caching

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <math.h>

// Evaluates a CUDA runtime call exactly once; if it did not return
// cudaSuccess, prints the file, line and CUDA error string to stderr and
// terminates the process with EXIT_FAILURE.
//
// The argument is parenthesized and the status local carries a trailing
// underscore so that an invocation such as CUDA_CHECK(err) cannot expand to a
// self-initializing declaration that shadows the caller's own variable.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cudaCheckStatus_ = (call); \
        if (cudaCheckStatus_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cudaCheckStatus_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)


/**
 * Copies a 2D single-channel float texture into a row-major output buffer,
 * one texel per thread.
 *
 * Launch layout: 2D grid of 2D blocks; any block shape is accepted as long as
 * the grid covers at least width x height threads. Threads that fall outside
 * the image do nothing. No shared memory is used.
 *
 * @param output  Device buffer of at least width * height floats.
 * @param width   Image width in texels.
 * @param height  Image height in texels.
 * @param tex     Texture object to sample. The fetch uses un-normalized
 *                coordinates (x + 0.5, y + 0.5), i.e. the centre of texel
 *                (x, y), so the texture must be created with
 *                normalizedCoords = 0.
 */
__global__ void textureKernel(float* output, int width, int height,
                              cudaTextureObject_t tex) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        // Sampling at the texel centre: the +0.5f offset addresses texel
        // (x, y) itself rather than a position between texels.
        float value = tex2D<float>(tex, x + 0.5f, y + 0.5f);
        output[y * width + x] = value;
    }
}

/**
 * Host driver: fills a WIDTH x HEIGHT float image, uploads it to a CUDA
 * array, wraps the array in a texture object (clamp addressing, linear
 * filtering, un-normalized coordinates) and runs textureKernel over it.
 *
 * Uses the texture *object* API (cudaCreateTextureObject) rather than the
 * legacy file-scope texture<> reference API, which is no longer available in
 * CUDA 12 toolkits.
 *
 * @return 0 on success; EXIT_FAILURE if the host allocation fails. Any CUDA
 *         error terminates the process through CUDA_CHECK.
 */
int main() {
    printf("=== Texture Memory ===\n\n");
    const int WIDTH = 1024, HEIGHT = 1024;
    const int TILE = 16;  // threads per block along each axis
    const size_t rowBytes = (size_t)WIDTH * sizeof(float);
    const size_t bytes = rowBytes * HEIGHT;

    float *h_input = (float*)malloc(bytes);
    if (h_input == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < WIDTH * HEIGHT; i++) h_input[i] = (float)i;

    float *d_output = NULL;
    CUDA_CHECK(cudaMalloc(&d_output, bytes));

    // Textures with 2D filtering/addressing are backed by an opaque CUDA array.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray_t cuArray = NULL;
    CUDA_CHECK(cudaMallocArray(&cuArray, &channelDesc, WIDTH, HEIGHT));
    // cudaMemcpy2DToArray replaces the deprecated cudaMemcpyToArray; the host
    // image is tightly packed, so the source pitch equals the row width.
    CUDA_CHECK(cudaMemcpy2DToArray(cuArray, 0, 0, h_input, rowBytes, rowBytes,
                                   HEIGHT, cudaMemcpyHostToDevice));

    // Describe the resource the texture object reads from.
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    // Sampling state: same settings the texture reference previously carried.
    cudaTextureDesc texDesc = {};
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = 0;

    cudaTextureObject_t texObj = 0;
    CUDA_CHECK(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));

    dim3 threads(TILE, TILE);
    dim3 blocks((WIDTH + TILE - 1) / TILE, (HEIGHT + TILE - 1) / TILE);

    textureKernel<<<blocks, threads>>>(d_output, WIDTH, HEIGHT, texObj);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    printf("Texture memory kernel executed successfully\n");
    printf("Benefits: cached, hardware interpolation, good for 2D access\n");

    // Destroy the texture object before releasing the array it refers to.
    CUDA_CHECK(cudaDestroyTextureObject(texObj));
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    return 0;
}


## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once; if it did not return
// cudaSuccess, prints the file, line and CUDA error string to stderr and
// terminates the process with EXIT_FAILURE.
//
// The argument is parenthesized and the status local carries a trailing
// underscore so that an invocation such as CUDA_CHECK(err) cannot expand to a
// self-initializing declaration that shadows the caller's own variable.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cudaCheckStatus_ = (call); \
        if (cudaCheckStatus_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cudaCheckStatus_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Doubles, in place, every element of `data` whose index is below `n`.
// Expects a 1D launch in which each thread handles at most one element;
// threads whose global index is n or greater exit without touching memory.
__global__ void kernel(float *data, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    data[gid] *= 2.0f;
}

/**
 * Host driver for the exercise: uploads n floats, times a single launch of
 * `kernel` with CUDA events, copies the result back and reports the elapsed
 * time and the effective bandwidth (one read plus one write per element).
 *
 * Every CUDA runtime call is wrapped in CUDA_CHECK so that a failure is
 * reported where it happens instead of surfacing later as a misleading
 * timing or a sticky error on an unrelated call.
 *
 * @return 0 on success; EXIT_FAILURE if the host allocation fails. Any CUDA
 *         error terminates the process through CUDA_CHECK.
 */
int main() {
    printf("=== Texture Memory ===\n\n");

    const int n = 1000000;
    const size_t size = (size_t)n * sizeof(float);

    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data = NULL;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;  // ceil-div so the tail is covered

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());  // a launch does not return an error itself
    CUDA_CHECK(cudaEventRecord(stop));
    // Block until `stop` has been reached so the elapsed time is valid.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    printf("Processed %d elements in %.2f ms\n", n, ms);
    // size * 2: each element is read once and written once by the kernel.
    printf("Bandwidth: %.2f GB/s\n", (size * 2 / 1e9) / (ms / 1000.0));

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

## Key Takeaways

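1. Texture memory is cached and optimized for access patterns with 2D spatial locality.
2. The texture hardware performs interpolation (filtering) and out-of-range address handling such as clamping.
3. Textures are read-only from within kernels.
4. Texture memory is useful for image processing.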

## Next Steps

Continue to: **19_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

---

---