<a href="https://colab.research.google.com/github/macsyd/GPU-Computing/blob/main/Assignment2/csc485b_assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install and load nvcc4jupyter, the IPython extension that provides the
# %%cuda_group_save / %cuda_group_run magics used by the cells below to
# save, compile and run the CUDA sources from this notebook.
# Documentation is here: https://nvcc4jupyter.readthedocs.io/en/latest/
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-korih8nn
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-korih8nn
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [None]:
%%cuda_group_save -g "source" -n "data_types.h"
/**
 * A collection of commonly used data types throughout this project.
 */
#pragma once

#include <iostream> // for std::ostream
#include <vector>

namespace csc485b{
namespace a2{

/** Identifier of a graph node. */
using node_t = int;
/** A directed edge; `x` and `y` hold the ids of its two endpoint nodes. */
using edge_t = int2;

/** Host-side list of edges. */
using edge_list_t = std::vector< edge_t >;
/** Host-side list of node ids. */
using node_list_t = std::vector< node_t >;

} // namespace a2
} // namespace csc485b


In [None]:
%%cuda_group_save -g "source" -n "cuda_common.h"
/**
 * Standard macros that can be useful for error checking.
 * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
 */
#pragma once

#include <cuda.h>

/**
 * Evaluates the CUDA runtime expression `exp` exactly once. If the result is
 * not cudaSuccess, prints the source file, line and cudaGetErrorString()
 * text, then terminates the process with EXIT_FAILURE.
 *
 * The do { ... } while(0) wrapper makes the macro behave as one statement,
 * so it can be used safely inside un-braced if/else branches.
 */
#define CUDA_CALL(exp)                                       \
    do {                                                     \
        cudaError res = (exp);                               \
        if(res != cudaSuccess) {                             \
            printf("Error at %s:%d\n %s\n",                  \
                __FILE__,__LINE__, cudaGetErrorString(res)); \
           exit(EXIT_FAILURE);                               \
        }                                                    \
    } while(0)

/**
 * Queries cudaGetLastError(). If an error is pending, prints `msg` together
 * with the source file, line and cudaGetErrorString() text, then terminates
 * the process with EXIT_FAILURE.
 *
 * `msg` is printed through a "%s" conversion, so it must be a C string.
 */
#define CHECK_ERROR(msg)                                             \
    do {                                                             \
        cudaError_t err = cudaGetLastError();                        \
        if(cudaSuccess != err) {                                     \
            printf("Error (%s) at %s:%d\n %s\n",                     \
                (msg), __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)

In [None]:
%%cuda_group_save -g "source" -n "data_generator.h"
/**
 * Functions for generating random input data with a fixed seed
 */
#pragma once

#include <cassert>  // for assert()
#include <cstddef>  // std::size_t type
#include <random>   // for std::mt19937, std::uniform_int_distribution
#include <vector>

#include "data_types.h"

namespace csc485b {
namespace a2 {

/**
 * Generates and returns a vector of random edges
 * for a graph `G=(V,E)` with `|V|=n` and expected `|E|=m`.
 * Referred to as an Erdős-Rényi graph.
 *
 * Every unordered pair {u,v} with u < v is kept with the same probability
 * and, when kept, is emitted as the two directed edges (u,v) and (v,u), so
 * the returned list is symmetric and has an even length. The generator is
 * seeded with a fixed constant: the same (n, m) always yields the same graph.
 *
 * @param n Number of nodes.
 * @param m Expected number of directed edges; must not exceed n(n-1).
 * @return  The generated edge list; empty when n < 2, because no pair of
 *          distinct nodes exists.
 *
 * @see https://networkx.org/documentation/stable/reference/generated/networkx.generators.random_graphs.fast_gnp_random_graph.html#networkx.generators.random_graphs.fast_gnp_random_graph
 */
edge_list_t generate_graph( std::size_t n, std::size_t m )
{
    // With fewer than two nodes there is nothing to connect, and the
    // denominator n(n-1) used below would be zero.
    if( n < 2 )
    {
        return edge_list_t{};
    }

    std::size_t const max_edges = n * ( n - 1 );
    assert( "At most n(n-1) edges in a simple graph" && m <= max_edges );

    // Requested density as an integer percentage in [0, 100].
    int const probability = static_cast< int >( ( 100 * m ) / max_edges );

    // for details of random number generation, see:
    // https://en.cppreference.com/w/cpp/numeric/random/uniform_int_distribution
    std::mt19937::result_type const random_seed = 20241008;  // use magic seed
    std::mt19937 rng( random_seed );                          // use mersenne twister generator

    // NOTE: the roll is uniform over the 101 values 0..100 and is compared
    // with `<=`, so a pair is kept with probability (probability + 1) / 101.
    // This is left untouched on purpose: changing it would change the graph
    // produced by the fixed seed.
    std::uniform_int_distribution<> distrib(0, 100);

    edge_list_t random_edges;
    random_edges.reserve( 2 * m );

    node_t const node_count = static_cast< node_t >( n );
    for( node_t u = 0; u < node_count; ++u )
    {
        for( node_t v = u + 1; v < node_count; ++v )
        {
            auto const dice_roll = distrib( rng );
            if( dice_roll <= probability )
            {
                random_edges.push_back( make_int2( u, v ) );
                random_edges.push_back( make_int2( v, u ) );
            }
        }
    }

    return random_edges;
}

 /**
  * Generates a simple, easily verifiable test graph in which the nodes are
  * arranged on a circle and every node i is joined to the k nodes that follow
  * it: (i+1) % n, (i+2) % n, ..., (i+k) % n.
  *
  * Each undirected connection is emitted exactly once, as the two directed
  * edges (a,b) and (b,a). This includes the connections that wrap around from
  * the end of the circle back to its start.
  *
  * @param n Number of nodes.
  * @param m Capacity hint only: space for 2*m edges is reserved up front.
  * @param k Number of following nodes each node connects to; at most n-1.
  * @return  The generated edge list.
  */
edge_list_t generate_circular_graph(std::size_t n, std::size_t m, std::size_t k) {
    // Ensure the graph is simple
    assert((n == 0 || k <= n - 1) && "Each node can connect to at most (n-1) other nodes.");

    edge_list_t edge_list;
    edge_list.reserve( 2 * m );
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = 1; j <= k; ++j) {
            // The pair {i, i+j} is also reachable as offset n-j from the other
            // endpoint. When j is the longer way round (2j > n), the shorter
            // offset n-j < j <= k has already produced or will produce it.
            if (2 * j > n) {
                continue;
            }

            std::size_t const target = (i + j) % n; // Circular connection

            // Exactly opposite nodes (2j == n) see each other at the same
            // offset from both sides; keep the pair from its smaller endpoint.
            if (2 * j == n && target < i) {
                continue;
            }

            node_t const from = static_cast<node_t>(i);
            node_t const to   = static_cast<node_t>(target);
            edge_list.push_back(make_int2(from, to));
            edge_list.push_back(make_int2(to, from));
        }
    }

    return edge_list;
}

} // namespace a2
} // namespace csc485b


In [None]:
%%cuda_group_save -g "source" -n "dense_graph.h"
/**
 * The file in which you will implement your DenseGraph GPU solutions!
 */

#include <cstddef>  // std::size_t type

#include "cuda_common.h"
#include "data_types.h"

namespace csc485b {
namespace a2      {

/**
 * Adjacency-matrix representation of a graph, intended for graphs whose
 * number of edges approaches n(n-1). The kernels in this file address the
 * matrix as adjacencyMatrix[row * n + col].
 */
struct DenseGraph
{
  std::size_t n;            /**< Number of nodes in the graph. */
  node_t * adjacencyMatrix; /**< Pointer to an n x n adj. matrix */

  /** Returns number of cells in the adjacency matrix, i.e. n squared. */
  __device__ __host__ __forceinline__
  std::size_t matrix_size() const
  {
    std::size_t const side = n;
    return side * side;
  }
};


namespace gpu {


/**
 * Constructs a DenseGraph from an input edge list of m edges.
 *
 * @pre The pointers in DenseGraph g have already been allocated.
 */
__global__
void build_graph( DenseGraph g, edge_t const * edge_list, std::size_t m )
{
    // IMPLEMENT ME!

    // set indices (i,j) and (j,i) to 1 in the adjacency matrix (g.adajcencyMatrix[i][j] and  g.adajcencyMatrix[j][i]) for each edge (i,j) in edge_list
    // every other entry gets set to 0
    int const th_id = blockIdx.x * blockDim.x + threadIdx.x;

    if (th_id < g.n * g.n) {
        g.adjacencyMatrix[th_id] = 0;
    }

    if (th_id < m) {
        edge_t my_edge = edge_list[th_id];
        int i = my_edge.x;
        int j = my_edge.y;

        g.adjacencyMatrix[i * (g.n) + j] = 1;
        //g.adjacencyMatrix[j * (g.n) + i] = 1; //not needed because edg (i,j) and (j,i) both appear in edge list
    }

    return;
}

/**
  * Repopulates the adjacency matrix as a new graph that represents
  * the two-hop neighbourhood of input graph g: cell (row, col) becomes 1 when
  * the sum over k of A[row][k] * A[k][col] is positive, and 0 otherwise.
  * The diagonal is always set to 0.
  *
  * The update is done in place, one thread per cell. Every thread first
  * computes its result from the unmodified matrix; only after a block-wide
  * barrier are the results written back. This makes a launch race-free when
  * a single block covers the matrix.
  *
  * NOTE: __syncthreads() does not order different blocks. In a multi-block
  * launch one block may still overwrite cells that another block has not
  * read yet; such launches need a separate output buffer.
  *
  * @param g                  Graph whose adjacency matrix is rewritten.
  * @param num_blocks         Unused by the kernel.
  * @param threads_per_block  Unused by the kernel.
  */
 __global__
 void one_block_solution(DenseGraph g, std::size_t const num_blocks, std::size_t const threads_per_block) {
    std::size_t const cell = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    bool const in_range = cell < g.matrix_size();

    node_t clamped_val = 0;
    if (in_range) {
        std::size_t const row = cell / g.n;
        std::size_t const col = cell % g.n;

        // working solution without tiling
        int temp_sum = 0;
        for (std::size_t k = 0; k < g.n; ++k) {
            temp_sum += g.adjacencyMatrix[row * g.n + k] * g.adjacencyMatrix[k * g.n + col];
        }

        // Clamp to [0,1] and ignore the diagonal.
        clamped_val = (temp_sum > 0 && row != col) ? 1 : 0;
    }

    // All reads of this block are finished before any of its writes start.
    // Threads outside the matrix must reach the barrier too.
    __syncthreads();

    if (in_range) {
        g.adjacencyMatrix[cell] = clamped_val;
    }
 }

 /**
  * Accumulates two-hop path counts with one thread per (col, row, mid)
  * triple: thread (x, y, z) adds A[y][z] * A[z][x] to buffer[y * n + x].
  *
  * The result is a raw count: it is neither clamped to {0, 1} nor cleared on
  * the diagonal, and the adjacency matrix itself is not modified.
  *
  * @pre buffer holds n * n ints that were set to 0 before the launch.
  * @pre Launched with a 3-D grid whose threads cover [0, n) in x, y and z.
  *
  * @param g                  Input graph (read only).
  * @param num_blocks         Unused by the kernel.
  * @param threads_per_block  Unused by the kernel.
  * @param buffer             Device buffer receiving the path counts.
  */
 __global__
void more_active_threads(DenseGraph g, std::size_t const num_blocks, std::size_t const threads_per_block, int* buffer) {
    // Each axis uses its own block index and block size.
    std::size_t const col = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    std::size_t const row = static_cast<std::size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
    std::size_t const mid = static_cast<std::size_t>(blockIdx.z) * blockDim.z + threadIdx.z;

    if (col >= g.n || row >= g.n || mid >= g.n) {
        return;
    }

    int const my_prod = g.adjacencyMatrix[row * g.n + mid] * g.adjacencyMatrix[mid * g.n + col];

    // Adding zero changes nothing, so skip the atomic and its contention.
    if (my_prod != 0) {
        atomicAdd(&buffer[row * g.n + col], my_prod);
    }
}

 /**
  * Copies two_hop_buffer element by element into the adjacency matrix of g,
  * one thread per cell. Threads past the last cell do nothing.
  */
 __global__
 void copy_buffer_to_matrix(DenseGraph g, int* two_hop_buffer) {
    int const cell = threadIdx.x + blockDim.x * blockIdx.x;
    if (cell >= g.n * g.n) {
        return;
    }
    g.adjacencyMatrix[cell] = two_hop_buffer[cell];
}

void two_hop_reachability( DenseGraph g, std::size_t const num_blocks, std::size_t const threads_per_block )
{
    printf("adjacency matrix gpu:\n");
    std::vector< a2::node_t > host_matrix( g.matrix_size() );
    a2::DenseGraph dg{ g.n, host_matrix.data() };
    cudaMemcpy( dg.adjacencyMatrix, g.adjacencyMatrix, sizeof( a2::node_t ) * g.matrix_size(), cudaMemcpyDeviceToHost );
    for (int i = 0; i < g.n; i++) {
        for (int j = 0; j < g.n; j++) {
            printf("%d ", dg.adjacencyMatrix[i * g.n + j]);
        }
        printf("\n");
    }

    /* for testing
    printf("two-hop matrix cpu:\n");
    int two_hop_matrix[g.n * g.n];
    for (int i = 0; i < g.n; i++) {
        for (int j = 0; j < g.n; j++) {
            int temp_sum = 0;
            for (int k = 0; k < g.n; k++) {
                temp_sum += dg.adjacencyMatrix[i * g.n + k] * dg.adjacencyMatrix[k * g.n + j];
            }
            two_hop_matrix[j * g.n + i] = temp_sum;
        }
    }
    for (int i = 0; i < g.n; i++) {
        for (int j = 0; j < g.n; j++) {
            printf("%d ", two_hop_matrix[i * g.n + j]);
        }
        printf("\n");
    }
    */

    // This solution doesn't seem to work beyond 1 block... -- maybe it does?
    csc485b::a2::gpu::one_block_solution<<< num_blocks, threads_per_block >>>( g, num_blocks, threads_per_block );

    /*
    std::size_t const new_threads_per_block = 32;
    std::size_t const new_num_blocks =  g.n / new_threads_per_block + 1; // use n^3 threads per output cell

    //buffer for two-hop adj matrix
    int two_hop_buffer[g.n * g.n];
    int *gpu_two_hop;
    int size = g.n * g.n * sizeof(int);
    for (int i = 0; i < (g.n * g.n); i++) {
        two_hop_buffer[i] = 0;
    }
    cudaMalloc((void **)&gpu_two_hop, size);
    cudaMemcpy(gpu_two_hop, two_hop_buffer, size, cudaMemcpyHostToDevice);

    more_active_threads<<< new_num_blocks, dim3{new_threads_per_block, new_threads_per_block, threads_per_block} >>>( g, new_num_blocks, new_threads_per_block, gpu_two_hop );

    cudaDeviceSynchronize();
    //csc485b::a2::gpu::copy_buffer_to_matrix<<< new_num_blocks, new_threads_per_block >>>(g, gpu_two_hop);

    cudaMemcpy(g.adjacencyMatrix, gpu_two_hop, size, cudaMemcpyDeviceToHost);
    cudaFree(gpu_two_hop);
    */

}

} // namespace gpu
} // namespace a2
} // namespace csc485b

In [None]:
%%cuda_group_save -g "source" -n "sparse_graph.h"
/**
 * The file in which you will implement your SparseGraph GPU solutions!
 */

#include <cstddef>  // std::size_t type

#include "cuda_common.h"
#include "data_types.h"

namespace csc485b {
namespace a2      {

/**
 * A SparseGraph is optimised for a graph in which the number of edges
 * is close to cn, for a small constant c. It is represented in CSR format:
 * the neighbours of node i are stored contiguously in `neighbours`, starting
 * at index `neighbours_start_at[i]`.
 */
struct SparseGraph
{
  std::size_t n; /**< Number of nodes in the graph. */
  std::size_t m; /**< Number of edges in the graph. */
  node_t * neighbours_start_at; /**< Pointer to an n=|V| offset array */
  node_t * neighbours; /**< Pointer to an m=|E| array of edge destinations */
};


namespace gpu {
/**
 * Helper function to build the neighbours_start_at array: replaces the n
 * entries of g.neighbours_start_at, in place, by their inclusive prefix sum
 * (entry t becomes the sum of entries 0..t).
 *
 * fill_array stores the degree of node i at index i+1 and leaves index 0 at
 * zero, so after this scan entry t is the position of node t's first
 * neighbour.
 *
 * The scan rewrites its own input, and blocks of one launch cannot be
 * ordered against each other, so only block 0 does any work; extra blocks
 * return immediately. Block 0 walks the array in chunks of blockDim.x
 * elements, scans each chunk in shared memory and carries the running total
 * into the next chunk, so n may be larger than the block.
 *
 * @param g         Graph whose neighbours_start_at array is scanned.
 * @param edge_list Unused.
 * @param m         Unused.
 */
__global__
void prefix_sum(SparseGraph g, edge_t const * edge_list, std::size_t m) {
    // A block has at most 1024 threads in x, so indexing by threadIdx.x fits.
    int constexpr shared_memory_size = 1024;
    // Two buffers: each scan step reads one and writes the other.
    __shared__ node_t smem[ 2 ][ shared_memory_size ];
    // Sum of all chunks that were scanned before the current one.
    __shared__ node_t carry_in;

    if (blockIdx.x != 0) {
        return;
    }

    unsigned int const lane  = threadIdx.x;
    unsigned int const width = blockDim.x;

    if (lane == 0) {
        carry_in = 0;
    }
    __syncthreads();

    // Loop bounds depend only on block-uniform values, so every thread
    // reaches every barrier below.
    for (std::size_t base = 0; base < g.n; base += width) {
        std::size_t const idx = base + lane;
        int in = 0;

        // Pad the last chunk with zeroes; they do not change any sum.
        smem[in][lane] = (idx < g.n) ? g.neighbours_start_at[idx] : 0;
        __syncthreads();

        // Inclusive scan of the chunk with doubling offsets.
        for (unsigned int offset = 1; offset < width; offset <<= 1) {
            int const out = 1 - in;
            node_t value = smem[in][lane];
            if (lane >= offset) {
                value += smem[in][lane - offset];
            }
            smem[out][lane] = value;
            __syncthreads();
            in = out;
        }

        if (idx < g.n) {
            g.neighbours_start_at[idx] = smem[in][lane] + carry_in;
        }
        __syncthreads(); // everyone has read carry_in before it changes

        if (lane == width - 1) {
            carry_in += smem[in][lane]; // total of this chunk
        }
        __syncthreads();
    }
}
/**
 * Helper meant to combine per-block partial prefix sums, so that the whole
 * neighbours_start_at array holds one prefix sum. Adds one other entry of
 * g.neighbours_start_at to each thread's own entry, using `stride` to pick
 * that entry.
 *
 * NOTE(review): for th_id < g.n the index on the `+=` line simplifies to
 * g.n * (1 - stride) - 1, which is negative for every stride >= 1. It is
 * evaluated in std::size_t, so it presumably wraps to a huge value and reads
 * out of bounds. Confirm the intended index before using this kernel.
 * NOTE(review): the `+=` line is not guarded by th_id < g.n, and smem is
 * indexed by the global thread id although it holds only 1024 entries.
 */
__global__
void merge_sums(SparseGraph g, int stride) {
    // add previous prefix sum blocks so whole neighbours_start_at array has prefix sum
    // stage the current values in shared memory (a single buffer)
    int const th_id = blockIdx.x * blockDim.x + threadIdx.x;
    int constexpr shared_memory_size = 1024;
    __shared__ node_t smem[ shared_memory_size ];
    if(th_id < g.n) {
        smem[th_id] = g.neighbours_start_at[th_id];
    }
    __syncthreads();

    // executed by every thread, including those with th_id >= g.n
    smem[th_id] += g.neighbours_start_at[th_id - g.n*stride + (g.n - (th_id % g.n) - 1)];
    __syncthreads();

    // write the combined value back to global memory
    if(th_id < g.n) {
        g.neighbours_start_at[th_id] = smem[th_id];
    }
}

__global__
void fill_array(SparseGraph g, edge_t const * edge_list, std::size_t m)
{
    int const th_id = blockIdx.x * blockDim.x + threadIdx.x;
    if (th_id < g.n) {
        g.neighbours_start_at[th_id] = 0;
    }

    if (th_id < m) {
        edge_t my_edge = edge_list[th_id];
        int i = my_edge.x;
        if (i+1 < m) atomicAdd(&g.neighbours_start_at[i+1], 1);
    }
    __syncthreads();
}

/**
 * Scatters the edge destinations into the CSR neighbours array, one thread
 * per edge: edge (i, j) stores j at
 * g.neighbours[g.neighbours_start_at[i] + slot], where slot is the next
 * free position of node i, handed out by an atomic counter.
 *
 * Slots are handed out in the order the atomics happen to execute, so the
 * order of the neighbours inside one node's range is not fixed.
 *
 * Threads past the last edge and edges whose source is outside [0, n) do
 * nothing.
 *
 * @pre g.neighbours_start_at already holds the final offsets.
 * @pre offset_array holds n ints that were set to 0 before the launch.
 *
 * @param g            Graph whose neighbours array is filled.
 * @param edge_list    Device pointer to m edges.
 * @param m            Number of edges in edge_list.
 * @param offset_array Per-node fill counters.
 */
__global__
void fill_neighbours(SparseGraph g, edge_t const * edge_list, std::size_t m, int * offset_array)
{
    std::size_t const th_id = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Check the bound before touching edge_list: the grid is usually rounded
    // up, so the last block has threads without an edge.
    if (th_id >= m) {
        return;
    }

    edge_t const my_edge = edge_list[th_id];
    int const source      = my_edge.x;
    int const destination = my_edge.y;

    if (source < 0 || static_cast<std::size_t>(source) >= g.n) {
        return;
    }

    // fill with appropriate values
    int const slot = atomicAdd(&offset_array[source], 1);
    g.neighbours[g.neighbours_start_at[source] + slot] = destination;
}
/**
 * Constructs a SparseGraph (CSR) from an input edge list of m edges, in
 * three steps:
 *  1. fill_array      - per-node edge counts, shifted by one slot;
 *  2. prefix_sum      - turns the counts into start offsets;
 *  3. fill_neighbours - scatters the destinations into their node's range.
 *
 * Terminates the process through CUDA_CALL / CHECK_ERROR if a CUDA call
 * fails. Does nothing when the graph has no nodes or threads_per_block is 0.
 *
 * @pre The pointers in SparseGraph g have already been allocated.
 *
 * @param g                  Graph to fill; its arrays live on the device.
 * @param edge_list          Device pointer to m edges.
 * @param m                  Number of edges in edge_list.
 * @param num_blocks         Minimum number of blocks for the counting step.
 * @param threads_per_block  Threads per block for the per-edge kernels.
 */
void build_graph( SparseGraph g, edge_t const * edge_list, std::size_t m, std::size_t const num_blocks, std::size_t const threads_per_block )
{
    if( g.n == 0 || threads_per_block == 0 )
    {
        return;
    }

    // fill neighbours_start_at with number of neighbours
    // The kernel both clears the n offsets and counts the m edges, so give it
    // at least max(n, m) threads.
    std::size_t const widest          = ( m > g.n ) ? m : g.n;
    std::size_t const covering_blocks = widest / threads_per_block + 1;
    std::size_t const fill_blocks     = ( num_blocks > covering_blocks ) ? num_blocks : covering_blocks;
    fill_array<<< fill_blocks, threads_per_block >>>( g, edge_list, m );
    CHECK_ERROR( "fill_array launch" );

    // Turn the counts into offsets. Launched as one block: the scan rewrites
    // neighbours_start_at in place, and only the threads of a single block
    // can be synchronised with __syncthreads().
    unsigned int constexpr scan_threads = 1024;
    prefix_sum<<< 1, scan_threads >>>( g, edge_list, m );
    CHECK_ERROR( "prefix_sum launch" );

    // No merge_sums step: the single-block scan above already yields one
    // prefix sum over the array, and merge_sums indexes neighbours_start_at
    // out of bounds.

    if( m == 0 )
    {
        return;
    }

    // use neighbours_start_at entries when filling in the neighbours array to know which "chunk" to put the neighbours in
    // one thread per edge, use atomic adds on global offset array to keep track of edge counts within chunks
    // The counters are zeroed on the device; no host-side staging array is needed.
    int * gpu_offset_array = nullptr;
    std::size_t const offset_bytes = g.n * sizeof( int );
    CUDA_CALL( cudaMalloc( (void **)&gpu_offset_array, offset_bytes ) );
    CUDA_CALL( cudaMemset( gpu_offset_array, 0, offset_bytes ) );

    // Round up so that every edge gets a thread without a spare block.
    std::size_t const edge_blocks = ( m + threads_per_block - 1 ) / threads_per_block;
    fill_neighbours<<< edge_blocks, threads_per_block >>>( g, edge_list, m, gpu_offset_array );
    CHECK_ERROR( "fill_neighbours launch" );

    CUDA_CALL( cudaFree( gpu_offset_array ) );
}

/**
  * Intended to repopulate the adjacency lists as a new graph that represents
  * the two-hop neighbourhood of input graph g.
  *
  * Not implemented yet: the body only returns, so a launch leaves g as it is.
  */
__global__
void two_hop_reachability( SparseGraph g )
{
    // IMPLEMENT ME!
    // algorithm unknown
    return;
}

} // namespace gpu
} // namespace a2
} // namespace csc485b

In [None]:
%%cuda_group_save -g "source" -n "main.cu"
/**
 * Driver for the benchmark comparison. Generates random data,
 * runs the CPU baseline, and then runs your code.
 */

#include <chrono>   // for timing
#include <iostream> // std::cout, std::endl
#include <iterator> // std::ostream_iterator
#include <vector>

#include "dense_graph.h"
#include "sparse_graph.h"

#include "data_generator.h"
#include "data_types.h"

/**
 * Runs timing tests on a CUDA graph implementation.
 * Consists of independently constructing the graph and then
 * modifying it to its two-hop neighbourhood.
 *
 * Allocates space for a dense graph and then runs the test code on it.
 * Prints the build time, the reachability time and the resulting n x n
 * matrix. Terminates the process through CUDA_CALL / CHECK_ERROR if a CUDA
 * call fails.
 *
 * @param d_edges Device pointer to m edges.
 * @param n       Number of nodes.
 * @param m       Number of edges in d_edges.
 */
void run_dense( csc485b::a2::edge_t const * d_edges, std::size_t n, std::size_t m )
{
    using namespace csc485b;

    // allocate device DenseGraph
    a2::node_t * d_matrix = nullptr;
    CUDA_CALL( cudaMalloc( (void**)&d_matrix, sizeof( a2::node_t ) * n * n ) );
    a2::DenseGraph d_dg{ n, d_matrix };
    printf("Producing Dense Graph and Two Hop Reachability:\n");

    CUDA_CALL( cudaDeviceSynchronize() );
    auto const build_start = std::chrono::high_resolution_clock::now();

    //kernel launch configs
    // The kernels touch all n * n matrix cells as well as the m edges, so the
    // grid is sized for whichever of the two is larger.
    std::size_t const threads_per_block = 1024;
    std::size_t const cell_count = d_dg.matrix_size();
    std::size_t const work_items = ( m > cell_count ) ? m : cell_count;
    std::size_t const num_blocks = work_items / threads_per_block + 1;

    //build 1d adj matrix
    csc485b::a2::gpu::build_graph<<< num_blocks, threads_per_block >>>( d_dg, d_edges, m );
    CHECK_ERROR( "dense build_graph launch" );

    CUDA_CALL( cudaDeviceSynchronize() );
    auto const reachability_start = std::chrono::high_resolution_clock::now();

    csc485b::a2::gpu::two_hop_reachability( d_dg, num_blocks, threads_per_block );

    CUDA_CALL( cudaDeviceSynchronize() );
    auto const end = std::chrono::high_resolution_clock::now();

    std::cout << "Build time: "
              << std::chrono::duration_cast<std::chrono::microseconds>(reachability_start - build_start).count()
              << " us"
              << std::endl;

    std::cout << "Reachability time: "
              << std::chrono::duration_cast<std::chrono::microseconds>(end - reachability_start).count()
              << " us"
              << std::endl;

    // check output: copy the matrix back and print it row by row
    std::vector< a2::node_t > host_matrix( cell_count );
    a2::DenseGraph dg{ n, host_matrix.data() };
    CUDA_CALL( cudaMemcpy( dg.adjacencyMatrix, d_dg.adjacencyMatrix, sizeof( a2::node_t ) * cell_count, cudaMemcpyDeviceToHost ) );
    for (std::size_t i = 0; i < dg.n; i++) {
        for (std::size_t j = 0; j < dg.n; j++) {
            printf("%d ", dg.adjacencyMatrix[i * dg.n + j]);
        }
        printf("\n");
    }

    // clean up
    CUDA_CALL( cudaFree( d_matrix ) );
}

/**
 * Allocates space for a sparse graph and then runs the test code on it.
 *
 * Builds the CSR graph from the edge list, launches the two-hop kernel and
 * prints the build time, the reachability time, the n offsets and the m
 * neighbour entries. Terminates the process through CUDA_CALL / CHECK_ERROR
 * if a CUDA call fails.
 *
 * @param d_edges Device pointer to m edges.
 * @param n       Number of nodes.
 * @param m       Number of edges in d_edges.
 */
void run_sparse( csc485b::a2::edge_t const * d_edges, std::size_t n, std::size_t m )
{
    using namespace csc485b;

    // allocate device SparseGraph
    a2::node_t * d_offsets = nullptr;
    a2::node_t * d_neighbours = nullptr;
    CUDA_CALL( cudaMalloc( (void**)&d_offsets,    sizeof( a2::node_t ) * n ) );
    CUDA_CALL( cudaMalloc( (void**)&d_neighbours, sizeof( a2::node_t ) * m ) );
    a2::SparseGraph d_sg{ n, m, d_offsets, d_neighbours };
    printf("\nProducing Sparse Graph and Two Hop Reachability:\n");

    CUDA_CALL( cudaDeviceSynchronize() );
    auto const build_start = std::chrono::high_resolution_clock::now();

    //kernel launch configs
    // NOTE(review): 4 threads per block is far below the 32-thread warp size;
    // presumably chosen to exercise multi-block paths on tiny inputs - confirm
    // before using these timings as a benchmark.
    std::size_t const threads_per_block = 4;
    std::size_t const num_blocks =  m / threads_per_block + 1; // use one thread per edge

    csc485b::a2::gpu::build_graph(d_sg, d_edges, m, num_blocks, threads_per_block);

    CUDA_CALL( cudaDeviceSynchronize() );
    auto const reachability_start = std::chrono::high_resolution_clock::now();

    csc485b::a2::gpu::two_hop_reachability<<< num_blocks, threads_per_block >>>( d_sg );
    CHECK_ERROR( "sparse two_hop_reachability launch" );

    CUDA_CALL( cudaDeviceSynchronize() );
    auto const end = std::chrono::high_resolution_clock::now();

    std::cout << "Build time: "
              << std::chrono::duration_cast<std::chrono::microseconds>(reachability_start - build_start).count()
              << " us"
              << std::endl;

    std::cout << "Reachability time: "
              << std::chrono::duration_cast<std::chrono::microseconds>(end - reachability_start).count()
              << " us"
              << std::endl;

    // check output: offsets on the first line, neighbours on the second
    std::vector< a2::node_t > host_matrix_sparse_neighbours( d_sg.m );
    std::vector< a2::node_t > host_matrix_neighbours_start( d_sg.n );
    a2::SparseGraph sg{ n, m, host_matrix_neighbours_start.data(), host_matrix_sparse_neighbours.data() };
    CUDA_CALL( cudaMemcpy( sg.neighbours_start_at, d_sg.neighbours_start_at, sizeof( a2::node_t ) * d_sg.n, cudaMemcpyDeviceToHost ) );
    std::copy( host_matrix_neighbours_start.cbegin(), host_matrix_neighbours_start.cend(), std::ostream_iterator< a2::node_t >( std::cout, " " ) );

    printf("\n");

    CUDA_CALL( cudaMemcpy( sg.neighbours, d_sg.neighbours, sizeof( a2::node_t ) * d_sg.m, cudaMemcpyDeviceToHost ) );
    std::copy( host_matrix_sparse_neighbours.cbegin(), host_matrix_sparse_neighbours.cend(), std::ostream_iterator< a2::node_t >( std::cout, " " ) );

    // clean up
    CUDA_CALL( cudaFree( d_neighbours ) );
    CUDA_CALL( cudaFree( d_offsets ) );
}

int main()
{
    using namespace csc485b;

    // Create input
    std::size_t constexpr n = 8;
    std::size_t constexpr expected_degree = n >> 1;

    a2::edge_list_t const graph = a2::generate_graph( n, n * expected_degree );
    std::size_t const m = graph.size();

    // lazily echo out input graph
    for( auto const& e : graph )
    {
        std::cout << "(" << e.x << "," << e.y << ") ";
    }
    std::cout << std::endl;
    // allocate and memcpy input to device
    a2::edge_t * d_edges;
    cudaMalloc( (void**)&d_edges, sizeof( a2::edge_t ) * m );
    cudaMemcpyAsync( d_edges, graph.data(), sizeof( a2::edge_t ) * m, cudaMemcpyHostToDevice );

    // run your code!
    run_dense ( d_edges, n, m );
    run_sparse( d_edges, n, m );

    return EXIT_SUCCESS;
}

In [None]:
%cuda_group_run --group "source" --compiler-args "-O3 -g -std=c++20 -arch=sm_75"

(0,2) (2,0) (0,3) (3,0) (0,4) (4,0) (0,5) (5,0) (0,6) (6,0) (1,3) (3,1) (1,4) (4,1) (1,5) (5,1) (1,7) (7,1) (2,3) (3,2) (2,5) (5,2) (3,4) (4,3) (3,5) (5,3) (3,6) (6,3) (4,5) (5,4) (4,6) (6,4) (4,7) (7,4) (5,6) (6,5) (5,7) (7,5) (6,7) (7,6) 
Producing Dense Graph and Two Hop Reachability:
adjacency matrix gpu:
0 0 1 1 1 1 1 0 
0 0 0 1 1 1 0 1 
1 0 0 1 0 1 0 0 
1 1 1 0 1 1 1 0 
1 1 0 1 0 1 1 1 
1 1 1 1 1 0 1 1 
1 0 0 1 1 1 0 1 
0 1 0 0 1 1 1 0 
Build time: 221 us
Reachability time: 53 us
0 1 1 1 1 1 1 1 
1 0 1 1 1 1 1 1 
1 1 0 1 1 1 1 1 
1 1 1 0 1 1 1 1 
1 1 1 1 0 1 1 1 
1 1 1 1 1 0 1 1 
1 1 1 1 1 1 0 1 
1 1 1 1 1 1 1 0 

Producing Sparse Graph and Two Hop Reachability:
Build time: 71 us
Reachability time: 20 us
0 5 9 12 18 24 31 36 
6 4 5 2 3 3 4 5 7 5 0 3 1 5 6 4 0 2 5 6 0 7 1 3 4 3 0 6 1 2 7 4 0 3 5 7 4 5 1 6 



Expected CSR output, e.g. for a small 4-node graph:<br>
neighbours_start_at = [0, 2, 4, 7]<br>
neighbours          = [2, 3, 2, 3, 0, 1, 3, 0, 1, 2]<br>
Expected CSR output for n=8 (possibly a different graph from the one currently being generated?):<br>
neighbours_start_at = [0, 5, 9, 12, 18, ...]<br>
neighbours          = [2, 3, 4, 5, 6, 3, 4, 5, 7, 0, 3, 5, 0, 1, 4, 5, 6, ...]