# Notebook 38: Kernel Fusion Optimization

## Phase 7: Performance Engineering

**Learning Objectives:**
- Understand kernel fusion
- Learn how to combine operations
- Master overhead reduction
- Apply concepts in practical scenarios
- Measure and analyze performance

## Concept: Kernel Fusion Optimization

**Topics Covered:**
- Kernel fusion
- Combining operations
- Overhead reduction

**Key Concepts:**
This notebook covers kernel fusion in the context of Phase 7: Performance Engineering.

## Example 1: Basic Kernel Fusion Optimization

In [None]:
%%cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the source file, line and the runtime's error string to
// stderr and terminates the process with EXIT_FAILURE.
// The argument is parenthesized so any expression can be passed safely, and the
// local is named err_ so it cannot shadow a caller variable called `err` that
// appears inside the checked expression.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Separate kernels (not fused)
// Unfused stage 1: doubles data[i] in place.
// Each thread derives one global index from its block and thread coordinates
// (x dimension only) and touches at most that one element; threads whose index
// is not below n exit without accessing memory.
__global__ void kernel1(float *data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    data[i] *= 2.0f;
}

// Unfused stage 2: adds 10 to data[i] in place.
// Each thread derives one global index from its block and thread coordinates
// (x dimension only) and touches at most that one element; threads whose index
// is not below n exit without accessing memory.
__global__ void kernel2(float *data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    data[i] += 10.0f;
}

// Unfused stage 3: replaces data[i] with its single-precision square root.
// Each thread derives one global index from its block and thread coordinates
// (x dimension only) and touches at most that one element; threads whose index
// is not below n exit without accessing memory.
__global__ void kernel3(float *data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    const float in = data[i];
    data[i] = sqrtf(in);
}

// Fused kernel (all operations in one)
// Fused version of kernel1 -> kernel2 -> kernel3: computes sqrtf(x * 2 + 10)
// for one element per thread. The element is read from data once, the three
// steps are applied to a local copy, and the result is written back once.
// Threads whose global index is not below n exit without accessing memory.
__global__ void fusedKernel(float *data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;

    float x = data[i];   // single read of the element
    x *= 2.0f;           // Operation 1: scale
    x += 10.0f;          // Operation 2: offset
    x = sqrtf(x);        // Operation 3: square root
    data[i] = x;         // single write of the result
}

// Benchmarks three back-to-back element-wise kernels against a single fused
// kernel on the same 2^24-element float array, checks the fused output for the
// first 1000 elements against a host-computed reference, and prints timings.
//
// Every CUDA runtime call is wrapped in CUDA_CHECK and kernel launches are
// followed by cudaGetLastError(), so a failure stops the program instead of
// silently producing meaningless timings.
// Returns 0 on completion (also when the result check prints INCORRECT, as
// before); exits with EXIT_FAILURE on allocation or CUDA errors.
int main() {
    printf("=== Kernel Fusion ===\n\n");

    int n = 1 << 24;
    size_t size = (size_t)n * sizeof(float);

    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = i + 1.0f;

    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, size));

    int threads = 256;
    int blocks = (n + threads - 1) / threads;   // ceil-div so every element gets a thread

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Warm-up: launch every kernel once, untimed. The first launch of a kernel
    // can carry one-time initialization cost; without this the separate-kernel
    // path (timed first) would absorb it and the speedup would be overstated.
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));
    kernel1<<<blocks, threads>>>(d_data, n);
    kernel2<<<blocks, threads>>>(d_data, n);
    kernel3<<<blocks, threads>>>(d_data, n);
    fusedKernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Test 1: Separate kernels (input re-uploaded because warm-up modified it)
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaEventRecord(start));
    kernel1<<<blocks, threads>>>(d_data, n);
    kernel2<<<blocks, threads>>>(d_data, n);
    kernel3<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaEventRecord(stop));
    // Launch errors are queried after the stop event is queued so the check
    // does not sit between the timed launches.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop));

    float separateTime;
    CUDA_CHECK(cudaEventElapsedTime(&separateTime, start, stop));

    // Test 2: Fused kernel (fresh copy of the input)
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaEventRecord(start));
    fusedKernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop));

    float fusedTime;
    CUDA_CHECK(cudaEventElapsedTime(&fusedTime, start, stop));

    // h_data is overwritten with the fused kernel's output from here on.
    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    // Verify the first 1000 outputs against the host reference.
    // fabsf keeps the comparison in floating point; plain abs() may bind to
    // the integer overload and truncate the difference to 0.
    bool correct = true;
    for (int i = 0; i < 1000; i++) {
        float expected = sqrtf((i + 1.0f) * 2.0f + 10.0f);
        if (fabsf(h_data[i] - expected) > 1e-3f) {
            correct = false;
            break;
        }
    }

    printf("Result: %s\n", correct ? "CORRECT" : "INCORRECT");
    printf("\nSeparate kernels: %.2f ms\n", separateTime);
    printf("Fused kernel:     %.2f ms\n", fusedTime);
    printf("Speedup:          %.2fx\n", separateTime / fusedTime);
    printf("\nFusion reduces:\n");
    printf("  - Global memory accesses\n");
    printf("  - Kernel launch overhead\n");
    printf("  - Device synchronization\n");

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}


## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the source file, line and the runtime's error string to
// stderr and terminates the process with EXIT_FAILURE.
// The argument is parenthesized so any expression can be passed safely, and the
// local is named err_ so it cannot shadow a caller variable called `err` that
// appears inside the checked expression.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Exercise starting point: doubles data[i] in place.
// Each thread derives one global index from its block and thread coordinates
// (x dimension only) and touches at most that one element; threads whose index
// is not below n exit without accessing memory.
__global__ void kernel(float *data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    data[i] *= 2.0f;
}

// Exercise harness: uploads 1,000,000 floats, times one launch of `kernel`
// with CUDA events, copies the result back, and prints the elapsed time and
// the effective bandwidth (one read plus one write per element).
//
// Every CUDA runtime call is wrapped in CUDA_CHECK and the launch is followed
// by cudaGetLastError(), so a failed call or bad launch configuration stops
// the program instead of producing a meaningless timing.
// Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
int main() {
    printf("=== Kernel Fusion ===\n\n");

    int n = 1000000;
    size_t size = (size_t)n * sizeof(float);

    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    int threads = 256;
    int blocks = (n + threads - 1) / threads;   // ceil-div so every element gets a thread

    // Warm-up: one untimed launch so that one-time first-launch cost is not
    // counted in the measurement. The input is re-uploaded afterwards because
    // the kernel modifies it in place.
    kernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<blocks, threads>>>(d_data, n);
    CUDA_CHECK(cudaEventRecord(stop));
    // Launch errors are queried after the stop event is queued so the check
    // does not sit inside the timed region.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    printf("Processed %d elements in %.2f ms\n", n, ms);
    // size * 2: each element is read once and written once.
    printf("Bandwidth: %.2f GB/s\n", (size * 2 / 1e9) / (ms / 1000.0));

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

## Key Takeaways

1. Combine multiple kernels into one
2. Reduces kernel launch overhead
3. Improves data locality
4. Trade-off: complexity vs performance

## Next Steps

Continue to: **39_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

---

---