# Notebook 32: Events and Performance Timing

## Phase 6: Streams & Concurrency

**Learning Objectives:**

- Understand CUDA events
- Learn timing
- Master synchronization
- Apply concepts in practical scenarios
- Measure and analyze performance

## Concept: Events and Performance Timing

**Topics Covered:**

- CUDA events
- Timing
- Synchronization

**Key Concepts:**

This notebook covers CUDA events in the context of Phase 6: Streams & Concurrency.

## Example 1: Basic Events and Performance Timing

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <math.h>

// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints the source file, line and CUDA error string to stderr
// and terminates the process with EXIT_FAILURE.
//
// The argument is parenthesized so an expression passed as `call` is
// evaluated as a unit, and the local is named `err_` so it cannot shadow a
// caller variable named `err` that appears inside `call`.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)


// Repeatedly applies x = sqrtf(x + 1.0f) to each of the first n elements of
// `data`, in place, `iterations` times per element.
//
// Each thread handles the single element at its global 1D index; threads
// whose index is >= n return without touching memory.
__global__ void workKernel(float* data, int n, int iterations) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }

    float acc = data[gid];
    for (int remaining = iterations; remaining > 0; --remaining) {
        acc = sqrtf(1.0f + acc);
    }
    data[gid] = acc;
}

// Times three back-to-back workKernel launches (50, 100 and 150 iterations)
// with CUDA events and prints the elapsed milliseconds for each launch and
// for the whole sequence.
//
// Events and kernels are all issued without an explicit stream argument, so
// they are ordered on the same (default) stream and each event completes
// only after the kernel launched before it.
//
// Returns 0 on success. Any CUDA failure terminates the process through
// CUDA_CHECK; a host allocation failure returns EXIT_FAILURE.
int main() {
    printf("=== Events and Timing ===\n\n");
    const int N = 1 << 20;
    const size_t bytes = N * sizeof(float);

    // Launch geometry shared by all three launches (ceil-div so the last
    // partial block is covered; the kernel bounds-checks its index).
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;

    float *h_data = (float*)malloc(bytes);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) h_data[i] = (float)i;

    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, bytes));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop, kernel1, kernel2;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventCreate(&kernel1));
    CUDA_CHECK(cudaEventCreate(&kernel2));

    CUDA_CHECK(cudaEventRecord(start));

    // A kernel launch returns no status itself; cudaGetLastError() surfaces
    // launch-configuration errors that would otherwise go unnoticed and
    // leave the timings below meaningless.
    workKernel<<<blocks, threads>>>(d_data, N, 50);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(kernel1));

    workKernel<<<blocks, threads>>>(d_data, N, 100);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(kernel2));

    workKernel<<<blocks, threads>>>(d_data, N, 150);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));

    // Block the host until `stop` (and therefore every earlier event and
    // kernel in the stream) has completed before reading elapsed times.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms_total, ms_k1, ms_k2, ms_k3;
    CUDA_CHECK(cudaEventElapsedTime(&ms_total, start, stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms_k1, start, kernel1));
    CUDA_CHECK(cudaEventElapsedTime(&ms_k2, kernel1, kernel2));
    CUDA_CHECK(cudaEventElapsedTime(&ms_k3, kernel2, stop));

    printf("Kernel 1 (50 iters):  %.3f ms\n", ms_k1);
    printf("Kernel 2 (100 iters): %.3f ms\n", ms_k2);
    printf("Kernel 3 (150 iters): %.3f ms\n", ms_k3);
    printf("Total time:           %.3f ms\n", ms_total);

    // Release host and device resources; cleanup failures are reported
    // rather than silently dropped.
    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaEventDestroy(kernel1));
    CUDA_CHECK(cudaEventDestroy(kernel2));
    return 0;
}


## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints the source file, line and CUDA error string to stderr
// and terminates the process with EXIT_FAILURE.
//
// The argument is parenthesized so an expression passed as `call` is
// evaluated as a unit, and the local is named `err_` so it cannot shadow a
// caller variable named `err` that appears inside `call`.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Doubles each of the first n elements of `data` in place.
//
// Each thread handles the single element at its global 1D index; threads
// whose index is >= n return without touching memory.
__global__ void kernel(float *data, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    data[gid] *= 2.0f;
}

// Times a single launch of `kernel` over one million floats with a pair of
// CUDA events, copies the result back to the host, and prints the elapsed
// time together with the bandwidth derived from it.
//
// Returns 0 on success. Any CUDA failure terminates the process through
// CUDA_CHECK; a host allocation failure returns EXIT_FAILURE.
int main() {
    printf("=== Events Timing ===\n\n");

    int n = 1000000;
    size_t size = n * sizeof(float);

    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    int threads = 256;
    int blocks = (n + threads - 1) / threads;  // ceil-div; kernel bounds-checks

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<blocks, threads>>>(d_data, n);
    // A launch returns no status itself; pick up configuration errors here so
    // a failed launch is not reported as a (very fast) successful one.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    // Wait for `stop` to complete before querying the elapsed time.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    printf("Processed %d elements in %.2f ms\n", n, ms);
    // The kernel reads and writes every element once, hence `size * 2` bytes
    // of traffic; ms / 1000.0 converts the event time to seconds.
    printf("Bandwidth: %.2f GB/s\n", (size * 2 / 1e9) / (ms / 1000.0));

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

## Key Takeaways

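1. Events mark points in a stream's execution.
2. They are used for both timing and synchronization.
3. `cudaEventRecord` places an event in a stream; `cudaEventSynchronize` blocks the host until that event has completed.
4. Event timing is measured on the GPU, so it is more accurate than CPU timers for asynchronous kernel launches.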

## Next Steps

Continue to: **33_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

---

---