# Notebook 29: Thrust Library Examples

## Phase 5: Advanced Algorithms

**Learning Objectives:**

- Understand Thrust
- Learn its STL-like interface
- Master high-level algorithms
- Apply concepts in practical scenarios
- Measure and analyze performance

## Concept: Thrust Library Examples

**Topics Covered:**

- Thrust
- STL-like interface
- High-level algorithms

**Key Concepts:**

This notebook covers Thrust in the context of Phase 5: Advanced Algorithms.

## Example 1: Basic Thrust Library Examples

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <math.h>

// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the file, line and CUDA error string to stderr and
// terminates the program with EXIT_FAILURE.
//
// The argument is parenthesized so any expression can be passed, and the
// temporary uses a reserved-looking name so that CUDA_CHECK(err) with a
// caller variable called `err` does not self-initialize the temporary.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)


#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>

// Unary function object that returns the square of its float argument.
// Marked __host__ __device__ so it can be invoked from both host and device code.
struct square_functor {
    __host__ __device__ float operator()(float value) const {
        const float squared = value * value;
        return squared;
    }
};

// Demonstrates three Thrust primitives (reduce, transform, sort) on 2^20
// pseudo-random floats, printing the sum and the five smallest squared values.
int main() {
    printf("=== Thrust Examples ===\n\n");
    const int N = 1 << 20;
    const int PREVIEW = 5;  // number of sorted elements printed at the end

    // Fill the host vector with integers in [0, 99], stored as float.
    thrust::host_vector<float> h_vec(N);
    for (int i = 0; i < N; i++) h_vec[i] = (float)(rand() % 100);

    // Constructing a device_vector from a host_vector copies the data to the GPU.
    thrust::device_vector<float> d_vec = h_vec;

    printf("Thrust: C++ STL-like interface for CUDA\n\n");

    printf("1. Reduction:\n");
    // Accumulate in double: the sum can reach 99 * 2^20 (~1e8), which is above
    // 2^24, the point where float can no longer represent every integer.
    // The type of the initial value selects the accumulator/result type.
    const double sum = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0, thrust::plus<double>());
    printf("   Sum: %.2f\n\n", sum);

    printf("2. Transform (square each element):\n");
    // In-place transform: the output range starts at the input range's begin.
    thrust::transform(d_vec.begin(), d_vec.end(), d_vec.begin(), square_functor());
    printf("   Transform complete\n\n");

    printf("3. Sort:\n");
    thrust::sort(d_vec.begin(), d_vec.end());
    printf("   Sort complete\n\n");

    // Only the first PREVIEW elements are printed, so copy just those back
    // to the host instead of the whole vector.
    thrust::host_vector<float> h_result(d_vec.begin(), d_vec.begin() + PREVIEW);
    printf("First 5 sorted: %.0f %.0f %.0f %.0f %.0f\n",
           h_result[0], h_result[1], h_result[2], h_result[3], h_result[4]);

    return 0;
}


## Practical Exercise

Complete the following exercises to practice the concepts learned.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the file, line and CUDA error string to stderr and
// terminates the program with EXIT_FAILURE.
//
// The argument is parenthesized so any expression can be passed, and the
// temporary uses a reserved-looking name so that CUDA_CHECK(err) with a
// caller variable called `err` does not self-initialize the temporary.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Doubles each of the first n elements of `data` in place.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
// index and advances by the total number of launched threads, so every
// element is processed for any grid/block size (including fewer threads than
// elements). Uses no shared memory. A non-positive n processes nothing.
__global__ void kernel(float *data, int n) {
    // Index arithmetic is done in size_t so that blockIdx.x * blockDim.x and
    // the stride increment cannot wrap a 32-bit integer.
    const size_t count = (n > 0) ? (size_t)n : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
        data[i] *= 2.0f;
    }
}

// Times a single launch of `kernel` over one million floats using CUDA events
// and prints the elapsed time and the effective memory bandwidth.
int main() {
    printf("=== Thrust Examples ===\n\n");

    int n = 1000000;
    size_t size = (size_t)n * sizeof(float);

    // Host buffer initialized to 0, 1, 2, ...
    float *h_data = (float*)malloc(size);
    if (h_data == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data = NULL;
    CUDA_CHECK(cudaMalloc(&d_data, size));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    int threads = 256;
    int blocks = (n + threads - 1) / threads;  // ceil-div so the grid covers all n elements

    CUDA_CHECK(cudaEventRecord(start));
    kernel<<<blocks, threads>>>(d_data, n);
    // A kernel launch returns no status; launch-configuration errors are
    // only visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    // Blocks until the stop event (and therefore the kernel) has completed;
    // errors raised during kernel execution surface here.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    printf("Processed %d elements in %.2f ms\n", n, ms);
    // Each element is read once and written once, hence size * 2 bytes moved.
    printf("Bandwidth: %.2f GB/s\n", (size * 2 / 1e9) / (ms / 1000.0));

    free(h_data);
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

## Key Takeaways

1. Thrust = C++ template library for CUDA
2. STL-like interface
3. Automatic memory management
4. Highly optimized primitives

## Next Steps

Continue to: **30_next_topic.ipynb**

## Notes

*Use this space to write your own notes and observations:*

------