# 01 Stream Basics

**FreeCodeCamp CUDA Course - Module 5**

Original Course: [https://www.youtube.com/watch?v=86FAWCzIe_4](https://www.youtube.com/watch?v=86FAWCzIe_4)

Source File: `01_stream_basics.cu`

---

## Overview

Leverage CUDA streams for concurrent execution.

---

## Learning Objectives

By the end of this notebook, you will:

1. Understand CUDA kernel syntax and execution
2. Learn GPU memory allocation and data transfer
3. Work with CUDA streams for async execution

---

## Setup

Make sure you've completed the setup from the first notebook (GPU enabled, nvcc4jupyter installed).

---

## Key Concepts

- **Kernel Function**: Uses `__global__` qualifier for GPU execution
- **Device Memory**: Allocated using `cudaMalloc`
- **Data Transfer**: Uses `cudaMemcpy` between host and device
- **Kernel Launch**: Syntax `kernel<<<blocks, threads>>>(...)`; to launch into a specific stream, pass it as the fourth launch parameter: `kernel<<<blocks, threads, 0, stream>>>(...)`

---

## CUDA Implementation

In [None]:
%%cu
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Wraps a CUDA runtime call: evaluates it once and aborts the program with a
// diagnostic (file, line, error code/string, and the call's source text) if it
// did not return cudaSuccess.
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)

// Reports a failed CUDA runtime call and terminates the process.
//   err  - status returned by the call (expected to be a cudaError_t)
//   func - source text of the call, produced by the macro's stringification
//   file - source file of the call site
//   line - source line of the call site
template <typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err != cudaSuccess) {
        // The code is printed with %d, so cast to int to match the format.
        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
                static_cast<int>(err), cudaGetErrorString(err), func);
        exit(EXIT_FAILURE);
    }
}

// Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < numElements.
// Expects a 1D grid of 1D blocks with at least numElements threads in total;
// threads whose index falls past the end do nothing. Uses no shared memory.
// The __restrict__ qualifiers require that A, B and C do not overlap.
__global__ void vectorAdd(const float* __restrict__ A, const float* __restrict__ B,
                          float* __restrict__ C, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}

// Adds two random vectors on the GPU using two streams and verifies the result
// against a host-side sum. Prints "Test PASSED" and returns 0 on success; exits
// with EXIT_FAILURE on any CUDA error or on a verification mismatch.
int main(void) {
    const int numElements = 50000;
    const size_t size = numElements * sizeof(float);

    float *h_A = NULL, *h_B = NULL, *h_C = NULL;
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    cudaStream_t stream1, stream2;
    cudaEvent_t bCopied;

    // Allocate pinned (page-locked) host memory. cudaMemcpyAsync can only be
    // truly asynchronous with pinned host buffers; with pageable malloc()
    // memory the copies may block the host. The allocations are also
    // error-checked, which the plain malloc() calls were not.
    CHECK_CUDA_ERROR(cudaMallocHost((void **)&h_A, size));
    CHECK_CUDA_ERROR(cudaMallocHost((void **)&h_B, size));
    CHECK_CUDA_ERROR(cudaMallocHost((void **)&h_C, size));

    // Initialize host arrays with values in [0, 1]
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device memory
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_A, size));
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_B, size));
    CHECK_CUDA_ERROR(cudaMalloc((void **)&d_C, size));

    // Create streams, plus an event used only for cross-stream ordering
    // (timing is disabled because the event is never used for measurement).
    CHECK_CUDA_ERROR(cudaStreamCreate(&stream1));
    CHECK_CUDA_ERROR(cudaStreamCreate(&stream2));
    CHECK_CUDA_ERROR(cudaEventCreateWithFlags(&bCopied, cudaEventDisableTiming));

    // Copy inputs to device asynchronously, one input per stream
    CHECK_CUDA_ERROR(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream1));
    CHECK_CUDA_ERROR(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream2));

    // Make sure d_B is copied before the kernel that uses it runs: stream1
    // waits on an event recorded in stream2 after the copy. This orders the
    // work on the device without stalling the host thread.
    CHECK_CUDA_ERROR(cudaEventRecord(bCopied, stream2));
    CHECK_CUDA_ERROR(cudaStreamWaitEvent(stream1, bCopied, 0));

    // Launch kernel in stream1 (ceil-div so every element gets a thread)
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(d_A, d_B, d_C, numElements);
    // A kernel launch returns no status; launch-configuration errors are
    // only visible through cudaGetLastError().
    CHECK_CUDA_ERROR(cudaGetLastError());

    // Copy result back to host asynchronously; stream order guarantees this
    // runs after the kernel.
    CHECK_CUDA_ERROR(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream1));

    // Synchronize streams so h_C is complete before the host reads it
    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream1));
    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream2));

    // Verify result against a host-side reference sum
    for (int i = 0; i < numElements; ++i) {
        if (fabsf(h_A[i] + h_B[i] - h_C[i]) > 1e-5f) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");

    // Clean up
    CHECK_CUDA_ERROR(cudaFree(d_A));
    CHECK_CUDA_ERROR(cudaFree(d_B));
    CHECK_CUDA_ERROR(cudaFree(d_C));
    CHECK_CUDA_ERROR(cudaEventDestroy(bCopied));
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream1));
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream2));
    CHECK_CUDA_ERROR(cudaFreeHost(h_A));
    CHECK_CUDA_ERROR(cudaFreeHost(h_B));
    CHECK_CUDA_ERROR(cudaFreeHost(h_C));
    return 0;
}

## Exercises

Practice with CUDA streams:

1. **More Streams**: Create 4+ streams and run operations in parallel
2. **Measure Speedup**: Compare single stream vs multiple streams
3. **Stream Dependencies**: Use events to create complex dependencies
4. **Overlap Computation**: Try overlapping data transfer with computation

---

## Key Takeaways

- CUDA enables massive parallelism for compute-intensive tasks
- Proper memory management is crucial for performance
- Understanding the thread hierarchy helps write efficient kernels
- Always synchronize when needed to ensure correctness

---

## Next Steps

Continue to the next notebook in Module 5 to learn more CUDA concepts!

---

## Notes

*Use this space for your learning notes:*