In [1]:
# Load the extension that allows us to compile CUDA code in python notebooks
# Documentation is here: https://nvcc4jupyter.readthedocs.io/en/latest/
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-jwmwnxph
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-jwmwnxph
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10743 sha256=3a570d3ab74e9051a615aa78d54722c78a69a0c83a1d407fccde14c37a30e96c
  Stored in directory: /tmp/pip-ephem-wheel-cache-0u46w5qn/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bu

In [2]:
%%cuda_group_save -g "source" -n "data_types.h"
/**
 * A collection of commonly used data types throughout this project.
 */
#pragma once

#include <stdint.h> // uint32_t

// Element type used for the data arrays in this project: an unsigned 32-bit integer.
using element_t = uint32_t;

In [3]:
%%cuda_group_save -g "source" -n "cuda_common.h"
/**
 * Standard macros that can be useful for error checking.
 * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
 */
#pragma once

#include <cuda.h>

/**
 * Evaluates the expression `exp` exactly once and treats its value as a CUDA
 * status code. If the status is not cudaSuccess, prints the source location
 * and the corresponding error string, then exits with EXIT_FAILURE.
 * Wrapped in do { } while(0) so that it behaves like a single statement.
 */
#define CUDA_CALL(exp)                                                   \
    do {                                                                 \
        cudaError const call_status = (exp);                             \
        if(cudaSuccess != call_status) {                                 \
            printf("Error at %s:%d\n %s\n",                              \
                __FILE__, __LINE__, cudaGetErrorString(call_status));    \
            exit(EXIT_FAILURE);                                          \
        }                                                                \
    } while(0)

/**
 * Queries cudaGetLastError() and, if it reports anything other than
 * cudaSuccess, prints the caller-supplied message `msg`, the source location
 * and the corresponding error string, then exits with EXIT_FAILURE.
 * Wrapped in do { } while (0) so that it behaves like a single statement.
 */
#define CHECK_ERROR(msg)                                                        \
    do {                                                                        \
        cudaError_t const last_status = cudaGetLastError();                     \
        if(last_status != cudaSuccess) {                                        \
            printf("Error (%s) at %s:%d\n %s\n",                                \
                (msg), __FILE__, __LINE__, cudaGetErrorString(last_status));    \
            exit(EXIT_FAILURE);                                                 \
        }                                                                       \
    } while (0)

In [4]:
%%cuda_group_save -g "source" -n "data_generator.h"
/**
 * Functions for generating random input data with a fixed seed
 */
#pragma once

#include <algorithm> // for std::generate
#include <cstddef>   // for std::size_t
#include <random>  // for std::mt19937, std::uniform_int_distribution
#include <vector>

#include "data_types.h"

namespace csc485b {
namespace a1 {

/**
 * Generates and returns a vector of n pseudo-random values, each drawn
 * uniformly from the inclusive range [0, 2n] and then converted to the
 * integral type T with a static_cast (so values that do not fit in T are
 * truncated by that conversion).
 *
 * The generator is seeded with a fixed constant, so repeated calls with the
 * same n return the same data. For n == 0 the result is an empty vector.
 *
 * The distribution draws std::size_t values rather than the default int, so
 * the upper bound 2n is not narrowed to int and cannot become negative for
 * large n (a negative upper bound would violate the distribution's a <= b
 * precondition).
 */
template < typename T >
std::vector< T > generate_uniform( std::size_t n )
{
    // for details of random number generation, see:
    // https://en.cppreference.com/w/cpp/numeric/random/uniform_int_distribution
    std::size_t const random_seed = 20240916;  // use magic seed
    std::mt19937 rng( random_seed );           // use mersenne twister generator
    std::uniform_int_distribution< std::size_t > distrib( 0, 2 * n );

    std::vector< T > random_data( n ); // init array
    std::generate( random_data.begin()
                 , random_data.end()
                 , [ &rng, &distrib ](){ return static_cast< T >( distrib( rng ) ); });

    return random_data;
}

} // namespace a1
} // namespace csc485b

In [5]:
%%cuda_group_save -g "source" -n "algorithm_choices.h"
#pragma once

#include <vector>

#include "data_types.h"

namespace csc485b {
namespace a1 {
namespace cpu {

/**
 * Declaration of the CPU entry point (defined in another translation unit).
 * Receives its own copy of the input because `data` is passed by value,
 * together with the position `switch_at` and the element count `n`.
 */
void run_cpu_baseline( std::vector< element_t > data, std::size_t switch_at, std::size_t n );

} // namespace cpu


namespace gpu {

/**
 * Declaration of the GPU entry point (defined in another translation unit).
 * Has the same parameter list as cpu::run_cpu_baseline: a by-value copy of
 * the input, the position `switch_at`, and the element count `n`.
 */
void run_gpu_soln( std::vector< element_t > data, std::size_t switch_at, std::size_t n );

} // namespace gpu
} // namespace a1
} // namespace csc485b

In [6]:
%%cuda_group_save -g "source" -n "cpu_baseline.cu"
/**
 * CPU methods that the GPU should outperform.
 */

#include "algorithm_choices.h"

#include <algorithm> // std::sort()
#include <chrono>    // for timing
#include <iostream>  // std::cout, std::endl

namespace csc485b {
namespace a1      {
namespace cpu     {

/**
 * Straightforward two-pass approach: first order the entire array ascending
 * with the standard library sort, then re-sort the tail that starts at
 * invert_at_pos into descending order with a second call to that same sort.
 */
void opposing_sort( element_t * data, std::size_t invert_at_pos, std::size_t num_elements )
{
    element_t * const range_begin = data;
    element_t * const tail_begin  = data + invert_at_pos;
    element_t * const range_end   = data + num_elements;

    // Pass 1: whole range, ascending (the default comparison is operator<).
    std::sort( range_begin, range_end );

    // Pass 2: only [invert_at_pos, num_elements), descending.
    std::sort( tail_begin, range_end, std::greater< element_t >{} );
}

/**
 * Run the single-threaded CPU baseline that students are supposed to outperform
 * in order to obtain higher grades on this assignment. Times the execution and
 * prints that "wall time" in nanoseconds to the standard output (e.g., the
 * screen), followed by the resulting elements on one line. Note that the
 * function takes the input by value so as to not perturb the original data
 * in place.
 *
 * Assumes n is no larger than data.size(), because the first n elements of
 * the copy are handed to opposing_sort() through a raw pointer.
 */
void run_cpu_baseline( std::vector< element_t > data, std::size_t switch_at, std::size_t n )
{
    // steady_clock is monotonic, which is what an interval measurement needs;
    // high_resolution_clock may be an alias for the adjustable system clock.
    auto const cpu_start = std::chrono::steady_clock::now();
    opposing_sort( data.data(), switch_at, n );
    auto const cpu_end = std::chrono::steady_clock::now();

    std::cout << "CPU Baseline time: "
              << std::chrono::duration_cast<std::chrono::nanoseconds>(cpu_end - cpu_start).count()
              << " ns" << std::endl;

    // Echo the result so that it can be compared against the GPU output.
    for( auto const x : data )
    {
        std::cout << x << " ";
    }
    std::cout << std::endl;
}

} // namespace cpu
} // namespace a1
} // namespace csc485b

In [17]:
%%cuda_group_save -g "source" -n "gpu_solution.cu"
/**
 * The file in which you will implement your GPU solutions!
 */

#include "algorithm_choices.h"

#include <chrono>    // for timing
#include <iostream>  // std::cout, std::endl

#include "cuda_common.h"

namespace csc485b {
namespace a1      {
namespace gpu     {

/**
 * The CPU baseline benefits from warm caches because the data was generated on
 * the CPU. Run the data through the GPU once with some arbitrary logic, with
 * the intent that the GPU cache is warm too and the comparison is fairer.
 *
 * Each thread inspects at most one element: the one at its global thread
 * index. The parameter invert_at_pos is accepted but not used by this kernel.
 */
__global__
void warm_the_gpu( element_t * data, std::size_t invert_at_pos, std::size_t num_elements )
{
    // Compute the global index in std::size_t (widening before the multiply)
    // so that it has the same type as num_elements: no signed/unsigned
    // comparison and no 32-bit wrap-around for very large launches.
    std::size_t const th_id = static_cast< std::size_t >( blockIdx.x ) * blockDim.x + threadIdx.x;

    // We know this will never be true, because of the data generator logic,
    // but I doubt that the compiler will figure it out. Thus every element
    // should be read, but none of them should be modified.
    if( th_id < num_elements && data[ th_id ] > num_elements * 100 )
    {
        ++data[ th_id ]; // should not be possible.
    }
}

/**
 * Your solution. Should match the CPU output.
 *
 * TODO: this kernel is still a placeholder. Every thread returns without
 * touching `data`, so the array comes back to the host in its original order.
 */
__global__
void opposing_sort( element_t * data, std::size_t invert_at_pos, std::size_t num_elements )
{
    // Compute the global index in std::size_t (widening before the multiply)
    // so that it has the same type as num_elements: no signed/unsigned
    // comparison and no 32-bit wrap-around for very large launches.
    std::size_t const th_id = static_cast< std::size_t >( blockIdx.x ) * blockDim.x + threadIdx.x;

    if( th_id < num_elements )
    {
        // IMPLEMENT ME!
        return;
    }
}

/**
 * Performs all the logic of allocating device vectors and copying host/input
 * vectors to the device. Times the opposing_sort() kernel with wall time,
 * but excludes set up and tear down costs such as mallocs, frees, and memcpies.
 */
void run_gpu_soln( std::vector< element_t > data, std::size_t switch_at, std::size_t n )
{
    // Kernel launch configurations. Feel free to change these.
    // This is set to maximise the size of a thread block on a T4, but it hasn't
    // been tuned. It's not known if this is optimal.
    std::size_t const threads_per_block = 1024;
    std::size_t const num_blocks =  ( n + threads_per_block - 1 ) / threads_per_block;

    // Allocate arrays on the device/GPU
    element_t * d_data;
    cudaMalloc( (void**) & d_data, sizeof( element_t ) * n );
    CHECK_ERROR("Allocating input array on device");

    // Copy the input from the host to the device/GPU
    cudaMemcpy( d_data, data.data(), sizeof( element_t ) * n, cudaMemcpyHostToDevice );
    CHECK_ERROR("Copying input array to device");

    // Warm the cache on the GPU for a more fair comparison
    warm_the_gpu<<< num_blocks, threads_per_block>>>( d_data, switch_at, n );

    // Time the execution of the kernel that you implemented
    auto const kernel_start = std::chrono::high_resolution_clock::now();
    opposing_sort<<< num_blocks, threads_per_block>>>( d_data, switch_at, n );
    auto const kernel_end = std::chrono::high_resolution_clock::now();
    CHECK_ERROR("Executing kernel on device");

    // After the timer ends, copy the result back, free the device vector,
    // and echo out the timings and the results.
    cudaMemcpy( data.data(), d_data, sizeof( element_t ) * n, cudaMemcpyDeviceToHost );
    CHECK_ERROR("Transferring result back to host");
    cudaFree( d_data );
    CHECK_ERROR("Freeing device memory");

    std::cout << "GPU Solution time: "
              << std::chrono::duration_cast<std::chrono::nanoseconds>(kernel_end - kernel_start).count()
              << " ns" << std::endl;

    for( auto const x : data ) std::cout << x << " "; std::cout << std::endl;
}

} // namespace gpu
} // namespace a1
} // namespace csc485b

In [20]:
%%cuda_group_save -g "source" -n "main.cu"
/**
 * Driver for the benchmark comparison. Generates random data,
 * runs the CPU baseline, and then runs your code.
 */

#include <cstddef>  // std::size_t type
#include <iostream> // std::cout, std::endl
#include <vector>

#include "algorithm_choices.h"
#include "data_generator.h"
#include "data_types.h"
#include "cuda_common.h"

int main()
{
    std::size_t const n = 4;
    std::size_t const switch_at = 3 * ( n >> 2 ) ;

    auto data = csc485b::a1::generate_uniform< element_t >( n );
    csc485b::a1::cpu::run_cpu_baseline( data, switch_at, n );
    csc485b::a1::gpu::run_gpu_soln( data, switch_at, n );

    return EXIT_SUCCESS;
}

In [21]:
# Build and run the files saved to the "source" group, passing the quoted flags to the compiler.
# NOTE(review): -O0 -g looks like an unoptimised debug build of the host code, so the
# timings printed below would be for unoptimised code — confirm this is intended for the benchmark.
%cuda_group_run --group "source" --compiler-args "-O0 -g -std=c++20 -arch=sm_75"

CPU Baseline time: 923 ns
0 1 3 5 
GPU Solution time: 13754 ns
0 3 1 5 

