# Notebook 05: Advanced Thread Indexing
## Phase 1: Foundations - Thread Hierarchy & Kernel Basics

**Learning Objectives:**
- Master thread indexing in 1D, 2D, and 3D configurations
- Understand grid-stride loops
- Handle arbitrary data sizes efficiently
- Learn best practices for thread indexing
- Implement reusable indexing patterns

## Concept: Thread Indexing Patterns

**1D Indexing:**
```cuda
int idx = blockIdx.x * blockDim.x + threadIdx.x;
```

**2D Indexing:**
```cuda
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int idx = row * width + col;
```

**3D Indexing:**
```cuda
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int z = blockIdx.z * blockDim.z + threadIdx.z;
int idx = z * width * height + y * width + x;
```

**Grid-Stride Loop:**
```cuda
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < n; i += stride) {
    // Process element i
}
```

## Example 1: Grid-Stride Loop for Large Arrays

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
//
// Each thread begins at its global 1D index and then jumps ahead by the total
// number of threads in the grid (blockDim.x * gridDim.x), so a launch with
// fewer threads than elements still covers the whole array.
// All index arithmetic is done in `int`.
__global__ void vectorAddGridStride(float *a, float *b, float *c, int n) {
    const int totalThreads = blockDim.x * gridDim.x;
    int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // The loop condition doubles as the bounds check for the array tail.
    while (pos < n) {
        c[pos] = a[pos] + b[pos];
        pos += totalThreads;
    }
}

// Report a failed CUDA runtime call and terminate the program.
// `what` is a short label for the step that failed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for Example 1: adds two 10-million-element vectors on the GPU
// using a fixed 1024 x 256 launch and verifies the result on the CPU.
// Returns 0 after printing CORRECT/INCORRECT; exits with EXIT_FAILURE if a
// host allocation or any CUDA call fails.
int main() {
    int n = 10000000;  // 10 million elements
    size_t size = (size_t)n * sizeof(float);

    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_a); free(h_b); free(h_c);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; i++) {
        h_a[i] = i * 1.0f;
        h_b[i] = i * 2.0f;
    }

    float *d_a, *d_b, *d_c;
    checkCuda(cudaMalloc(&d_a, size), "cudaMalloc d_a");
    checkCuda(cudaMalloc(&d_b, size), "cudaMalloc d_b");
    checkCuda(cudaMalloc(&d_c, size), "cudaMalloc d_c");

    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice), "copy b to device");

    // Launch with fewer threads than elements; the kernel's loop covers the rest
    int threadsPerBlock = 256;
    int blocksPerGrid = 1024;  // Fixed number of blocks

    printf("Array size: %d elements\n", n);
    printf("Blocks: %d, Threads/block: %d\n", blocksPerGrid, threadsPerBlock);
    printf("Total threads: %d\n", blocksPerGrid * threadsPerBlock);
    printf("Each thread processes ~%d elements\n", n / (blocksPerGrid * threadsPerBlock));

    vectorAddGridStride<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    // A launch returns no status itself: query the launch error explicitly,
    // then synchronize so that faults raised while the kernel runs are reported here.
    checkCuda(cudaGetLastError(), "vectorAddGridStride launch");
    checkCuda(cudaDeviceSynchronize(), "vectorAddGridStride execution");

    checkCuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost), "copy c to host");

    // Verify against the same single-precision addition done on the host
    bool correct = true;
    for (int i = 0; i < n; i++) {
        if (h_c[i] != h_a[i] + h_b[i]) {
            printf("Error at %d\n", i);
            correct = false;
            break;
        }
    }
    printf("Result: %s\n", correct ? "CORRECT" : "INCORRECT");

    free(h_a); free(h_b); free(h_c);
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");

    return 0;
}

## Example 2: 3D Thread Indexing for Volume Data

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Element-wise addition of two volumes stored as flat arrays: c = a + b.
//
// Each thread derives an (x, y, z) coordinate from its 3D block and thread
// indices and handles the single element at that coordinate. The flat index is
// z * (width * height) + y * width + x. Threads whose coordinate lies outside
// width x height x depth do nothing.
// The threads handling the first five flat indices also print their
// coordinates and block/thread indices.
__global__ void volumeAdd(float *a, float *b, float *c, int width, int height, int depth) {
    const int col   = blockIdx.x * blockDim.x + threadIdx.x;
    const int row   = blockIdx.y * blockDim.y + threadIdx.y;
    const int slice = blockIdx.z * blockDim.z + threadIdx.z;

    // Guard clause: the grid may overshoot the volume in any dimension.
    if (col >= width || row >= height || slice >= depth) {
        return;
    }

    const int flat = slice * (width * height) + row * width + col;
    c[flat] = a[flat] + b[flat];

    // Print info for the first few elements only
    if (flat < 5) {
        printf("Volume[%d][%d][%d] (idx=%d): Block(%d,%d,%d) Thread(%d,%d,%d)\n",
               slice, row, col, flat,
               blockIdx.z, blockIdx.y, blockIdx.x,
               threadIdx.z, threadIdx.y, threadIdx.x);
    }
}

// Report a failed CUDA runtime call and terminate the program.
// `what` is a short label for the step that failed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for Example 2: adds two 32x32x32 volumes on the GPU with a 3D
// launch of 8x8x8 blocks and verifies the result on the CPU.
// Returns 0 after printing CORRECT/INCORRECT; exits with EXIT_FAILURE if a
// host allocation or any CUDA call fails.
int main() {
    int width = 32;
    int height = 32;
    int depth = 32;
    int totalElements = width * height * depth;
    size_t size = (size_t)totalElements * sizeof(float);

    printf("Volume size: %dx%dx%d = %d elements\n", width, height, depth, totalElements);

    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_a); free(h_b); free(h_c);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < totalElements; i++) {
        h_a[i] = i * 1.0f;
        h_b[i] = i * 2.0f;
    }

    float *d_a, *d_b, *d_c;
    checkCuda(cudaMalloc(&d_a, size), "cudaMalloc d_a");
    checkCuda(cudaMalloc(&d_b, size), "cudaMalloc d_b");
    checkCuda(cudaMalloc(&d_c, size), "cudaMalloc d_c");

    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice), "copy b to device");

    // 3D configuration. The host variables are named `block`/`grid` so they do
    // not shadow the CUDA built-ins blockDim/gridDim.
    dim3 block(8, 8, 8);
    // Ceiling division so the grid covers sizes that are not multiples of the block
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y,
              (depth + block.z - 1) / block.z);

    // dim3 members are unsigned, hence %u
    printf("Block dimensions: %ux%ux%u\n", block.x, block.y, block.z);
    printf("Grid dimensions: %ux%ux%u\n\n", grid.x, grid.y, grid.z);

    volumeAdd<<<grid, block>>>(d_a, d_b, d_c, width, height, depth);
    // A launch returns no status itself: query the launch error explicitly,
    // then synchronize so that faults raised while the kernel runs are reported here.
    checkCuda(cudaGetLastError(), "volumeAdd launch");
    checkCuda(cudaDeviceSynchronize(), "volumeAdd execution");

    checkCuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost), "copy c to host");

    // Verify against the same single-precision addition done on the host
    bool correct = true;
    for (int i = 0; i < totalElements; i++) {
        if (h_c[i] != h_a[i] + h_b[i]) {
            correct = false;
            break;
        }
    }
    printf("\nResult: %s\n", correct ? "CORRECT" : "INCORRECT");

    free(h_a); free(h_b); free(h_c);
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");

    return 0;
}

## Example 3: Strided Access Patterns

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Writes output[i] = input[i] * 2.0f for a strided subset of [0, n).
//
// Each thread starts at its global 1D index and advances by
// stride * (total threads in the grid). With stride == 1 every element of
// [0, n) is visited. With stride > 1 the kernel visits runs of
// (total threads) consecutive elements and skips the
// (stride - 1) * (total threads) elements between consecutive runs.
//
// Parameters:
//   input  - source array, at least n elements
//   output - destination array, at least n elements
//   n      - number of elements
//   stride - multiplier applied to the per-iteration step; values <= 0 are
//            rejected and leave `output` untouched
__global__ void stridedAccess(float *input, float *output, int n, int stride) {
    // A zero stride would never advance the loop and a negative one would walk
    // to negative indices, so bail out before touching memory.
    if (stride <= 0) {
        return;
    }

    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Loop-invariant step, computed in 64 bits so that
    // stride * blockDim.x * gridDim.x cannot wrap around.
    const long long step = (long long)stride * blockDim.x * gridDim.x;

    for (long long i = idx; i < n; i += step) {
        output[i] = input[i] * 2.0f;
    }
}

// Interleaved access pattern: writes output[i] = input[i] * 2.0f for every
// i in [0, n).
//
// A thread handles its own global 1D index first and then every index that
// lies a multiple of (blockDim.x * gridDim.x) further on, so neighbouring
// threads always work on neighbouring elements.
__global__ void interleavedAccess(float *input, float *output, int n) {
    const int step = blockDim.x * gridDim.x;
    int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // The loop condition doubles as the bounds check for the array tail.
    while (pos < n) {
        output[pos] = input[pos] * 2.0f;
        pos += step;
    }
}

// Report a failed CUDA runtime call and terminate the program.
// `what` is a short label for the step that failed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for Example 3: times stridedAccess (with stride 1) and
// interleavedAccess over a 1,000,000-element array using CUDA events, then
// verifies the final contents of the output buffer on the CPU.
// Returns 0 after printing CORRECT/INCORRECT; exits with EXIT_FAILURE if a
// host allocation or any CUDA call fails.
int main() {
    int n = 1000000;
    size_t size = (size_t)n * sizeof(float);

    float *h_input = (float*)malloc(size);
    float *h_output = (float*)malloc(size);
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_input); free(h_output);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; i++) {
        h_input[i] = i * 1.0f;
    }

    float *d_input, *d_output;
    checkCuda(cudaMalloc(&d_input, size), "cudaMalloc d_input");
    checkCuda(cudaMalloc(&d_output, size), "cudaMalloc d_output");

    checkCuda(cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice), "copy input to device");

    int threadsPerBlock = 256;
    int blocksPerGrid = 256;

    printf("Testing different access patterns:\n");
    printf("Array size: %d elements\n\n", n);

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "create start event");
    checkCuda(cudaEventCreate(&stop), "create stop event");

    // Untimed warm-up launch: the first kernel launch in a process can include
    // one-time initialization cost, which would otherwise be charged to
    // whichever kernel happens to be timed first.
    interleavedAccess<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    checkCuda(cudaGetLastError(), "warm-up launch");
    checkCuda(cudaDeviceSynchronize(), "warm-up execution");

    // Test strided access (stride = 1, so it visits every element)
    checkCuda(cudaEventRecord(start), "record start (strided)");
    stridedAccess<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n, 1);
    checkCuda(cudaGetLastError(), "stridedAccess launch");
    checkCuda(cudaEventRecord(stop), "record stop (strided)");
    checkCuda(cudaEventSynchronize(stop), "stridedAccess execution");
    float time1;
    checkCuda(cudaEventElapsedTime(&time1, start, stop), "elapsed time (strided)");
    printf("Strided access: %.3f ms\n", time1);

    // Test interleaved access
    checkCuda(cudaEventRecord(start), "record start (interleaved)");
    interleavedAccess<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, n);
    checkCuda(cudaGetLastError(), "interleavedAccess launch");
    checkCuda(cudaEventRecord(stop), "record stop (interleaved)");
    checkCuda(cudaEventSynchronize(stop), "interleavedAccess execution");
    float time2;
    checkCuda(cudaEventElapsedTime(&time2, start, stop), "elapsed time (interleaved)");
    printf("Interleaved access: %.3f ms\n", time2);

    checkCuda(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost), "copy output to host");

    // Verify: every element must be exactly twice its input
    bool correct = true;
    for (int i = 0; i < n; i++) {
        if (h_output[i] != h_input[i] * 2.0f) {
            correct = false;
            break;
        }
    }
    printf("\nResult: %s\n", correct ? "CORRECT" : "INCORRECT");

    free(h_input); free(h_output);
    checkCuda(cudaFree(d_input), "cudaFree d_input");
    checkCuda(cudaFree(d_output), "cudaFree d_output");
    checkCuda(cudaEventDestroy(start), "destroy start event");
    checkCuda(cudaEventDestroy(stop), "destroy stop event");

    return 0;
}

## Example 4: Thread ID Calculation Utilities

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Utility functions for thread indexing
// Returns the calling thread's global index along x:
// blockIdx.x * blockDim.x + threadIdx.x.
__device__ int getGlobalIdx1D() {
    const unsigned int blockStart = blockIdx.x * blockDim.x;
    return blockStart + threadIdx.x;
}

// Returns the calling thread's flat index into a row-major 2D array.
//   width - number of elements per row
// The row comes from the y block/thread indices and the column from the x
// indices; the result is row * width + col. No bounds check is performed.
__device__ int getGlobalIdx2D(int width) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int rowStart = row * width;
    return rowStart + col;
}

// Returns the calling thread's flat index into a 3D volume stored slice by
// slice, each slice row-major.
//   width  - elements per row
//   height - rows per slice
// The result is z * (width * height) + y * width + x, where x, y and z are the
// thread's global coordinates. No bounds check is performed.
__device__ int getGlobalIdx3D(int width, int height) {
    const int col   = blockIdx.x * blockDim.x + threadIdx.x;
    const int row   = blockIdx.y * blockDim.y + threadIdx.y;
    const int slice = blockIdx.z * blockDim.z + threadIdx.z;

    const int sliceSize = width * height;
    return slice * sliceSize + row * width + col;
}

// Returns the number of threads along x in the whole grid
// (blockDim.x * gridDim.x).
__device__ int getTotalThreads1D() {
    const int threadsInGrid = blockDim.x * gridDim.x;
    return threadsInGrid;
}

// Prints one line describing the calling thread: its 3D block index, its 3D
// thread index, and its global index along x.
__device__ void printThreadInfo() {
    const int globalIdx = getGlobalIdx1D();

    printf("Thread info - Block: (%d,%d,%d), Thread: (%d,%d,%d), Global 1D: %d\n",
           blockIdx.x, blockIdx.y, blockIdx.z,
           threadIdx.x, threadIdx.y, threadIdx.z,
           globalIdx);
}

// Demo kernel: threads whose global x index is below 5 print their block and
// thread indices; all other threads exit immediately.
__global__ void demonstrateIndexing() {
    if (getGlobalIdx1D() >= 5) {
        return;
    }

    printThreadInfo();
}

// Element-wise vector addition, c[i] = a[i] + b[i] for i in [0, n), built on
// the indexing helpers: each thread starts at getGlobalIdx1D() and advances by
// getTotalThreads1D() until it runs past the end of the array.
__global__ void vectorAddWithUtilities(float *a, float *b, float *c, int n) {
    const int step = getTotalThreads1D();
    int pos = getGlobalIdx1D();

    // The loop condition doubles as the bounds check for the array tail.
    while (pos < n) {
        c[pos] = a[pos] + b[pos];
        pos += step;
    }
}

// Report a failed CUDA runtime call and terminate the program.
// `what` is a short label for the step that failed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for Example 4: runs the indexing demo kernel on a 2 x 4 launch,
// then checks the utility-based vector addition on 10,000 elements against a
// CPU computation.
// Returns 0 after printing CORRECT/INCORRECT; exits with EXIT_FAILURE if a
// host allocation or any CUDA call fails.
int main() {
    printf("Demonstrating indexing utilities:\n\n");

    demonstrateIndexing<<<2, 4>>>();
    // A launch returns no status itself: query the launch error explicitly.
    // The synchronize call also reports faults raised while the kernel runs.
    checkCuda(cudaGetLastError(), "demonstrateIndexing launch");
    checkCuda(cudaDeviceSynchronize(), "demonstrateIndexing execution");

    // Test with vector addition
    int n = 10000;
    size_t size = (size_t)n * sizeof(float);

    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_a); free(h_b); free(h_c);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; i++) {
        h_a[i] = i * 1.0f;
        h_b[i] = i * 2.0f;
    }

    float *d_a, *d_b, *d_c;
    checkCuda(cudaMalloc(&d_a, size), "cudaMalloc d_a");
    checkCuda(cudaMalloc(&d_b, size), "cudaMalloc d_b");
    checkCuda(cudaMalloc(&d_c, size), "cudaMalloc d_c");

    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice), "copy b to device");

    vectorAddWithUtilities<<<256, 256>>>(d_a, d_b, d_c, n);
    checkCuda(cudaGetLastError(), "vectorAddWithUtilities launch");
    checkCuda(cudaDeviceSynchronize(), "vectorAddWithUtilities execution");

    checkCuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost), "copy c to host");

    // Verify against the same single-precision addition done on the host
    bool correct = true;
    for (int i = 0; i < n; i++) {
        if (h_c[i] != h_a[i] + h_b[i]) {
            correct = false;
            break;
        }
    }
    printf("\nVector addition with utilities: %s\n", correct ? "CORRECT" : "INCORRECT");

    free(h_a); free(h_b); free(h_c);
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");

    return 0;
}

## Practical Exercise

**Exercise 1:** Implement a kernel that processes every other element (even indices only)

**Exercise 2:** Create a 3D kernel for image processing (RGB channels)

**Exercise 3:** Implement a grid-stride loop for matrix multiplication

**Exercise 4:** Write utility functions for converting between 1D and 2D/3D indices

In [None]:
%%cu
// Your solution here
#include <stdio.h>

// Exercise placeholder kernel: takes no arguments and has an empty body until
// a solution is filled in.
__global__ void myKernel() {
    // TODO: Implement your solution
}

// Exercise placeholder entry point: currently does no work and returns 0.
int main() {
    // TODO: Test your implementation
    
    return 0;
}

## Key Takeaways

1. **Grid-stride loops** let a fixed-size grid process arrays that have more elements than launched threads
2. **3D indexing** is useful for volume data and multi-channel images
3. **stride = blockDim.x * gridDim.x** for 1D grid-stride
4. Always **bounds-check** every indexing pattern (with an `if` guard or the loop condition)
5. **Utility functions** improve code readability and reusability
6. Different access patterns have different performance characteristics
7. Choose indexing pattern based on data structure and access pattern

## Next Steps

Congratulations on completing Phase 1! You now understand:
- CUDA basics and GPU architecture
- Thread hierarchy and kernel launches
- Memory management and data transfer
- Thread indexing patterns

In Phase 2, we'll dive into:
- Memory types and optimization
- Shared memory usage
- Memory coalescing
- Bandwidth optimization

Continue to: **Phase 2 - 06_memory_basics.ipynb**

## Notes

*Use this space to write your own notes and observations:*

---



---