# Large Scale 3D Stencil SpMV on GPU (Tesla T4)

This notebook benchmarks a **3D 7-Point Stencil SpMV** kernel using Kokkos **TeamPolicy**.
It generates a 3D grid whose node IDs are randomly shuffled (emulating an unstructured mesh) and measures the kernel's throughput in GFLOP/s on a GPU.

**Hardware:** NVIDIA Tesla T4 (standard Colab GPU)

**Metric:** GFLOP/s (floating-point throughput of the SpMV kernel, averaged over repeated runs)

### Instructions:
1. Ensure Runtime is set to **GPU** (Runtime > Change runtime type > T4 GPU).
2. Run all cells below.

In [None]:
# 1. Check GPU
# Runs `nvidia-smi` so the attached GPU can be confirmed before anything is built.
!nvidia-smi

In [None]:
# 2. Install Dependencies
import os
!apt-get install -y cmake build-essential
if not os.path.exists("kokkos"):
    !git clone https://github.com/kokkos/kokkos.git
    print("Kokkos Cloned successfully.")

In [None]:
# 3. Write Benchmark Code (3D Stencil + TeamPolicy)
source_code = r"""
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <limits>
#include <random>
#include <stdexcept>
#include <vector>

// Host-side sparse matrix in CSR (compressed sparse row) layout.
// The counters default to zero so a default-constructed matrix is a valid
// empty matrix rather than holding indeterminate values.
struct HostCSR {
    int num_rows = 0;             // number of matrix rows
    int num_nnz = 0;              // number of stored entries
    std::vector<int> row_map;     // per-row offsets into col_idx/values (num_rows + 1 entries once filled)
    std::vector<int> col_idx;     // column index of each stored entry
    std::vector<double> values;   // value of each stored entry
};

// --- GENERATOR 3D SHUFFLED ---
// Builds the CSR matrix of a 7-point stencil (self + up to six axis
// neighbours) on an nx * ny * nz grid whose node IDs have been randomly
// permuted with a fixed seed. Every stored value is 1.0 and the column
// indices of each row are sorted ascending.
//
// nx, ny, nz: grid extents; any extent of 0 yields an empty matrix.
// Returns the matrix with num_rows == nx * ny * nz.
// Throws std::invalid_argument for a negative extent or when the row or
// non-zero count does not fit in int (the index type of HostCSR).
HostCSR generate_3d_stencil_shuffled(int nx, int ny, int nz) {
    if (nx < 0 || ny < 0 || nz < 0) {
        throw std::invalid_argument(
            "generate_3d_stencil_shuffled: grid dimensions must be non-negative");
    }

    // Size arithmetic is done in 64 bits so that an oversized grid is
    // rejected instead of silently wrapping around in int.
    const long long int_max = std::numeric_limits<int>::max();
    const long long plane = static_cast<long long>(nx) * ny;
    if (nz > 0 && plane > int_max / nz) {
        throw std::invalid_argument(
            "generate_3d_stencil_shuffled: node count does not fit in int");
    }
    const long long total = plane * nz;

    HostCSR mat;
    mat.num_rows = static_cast<int>(total);
    mat.num_nnz = 0;
    mat.row_map.push_back(0);
    if (total == 0) {
        return mat;
    }

    // Exact entry count: one diagonal entry per node plus two entries per
    // grid edge along each axis. Every term is bounded by `total`.
    const long long nnz = total + 2 * ((nx - 1LL) * ny * nz +
                                       nx * (ny - 1LL) * nz +
                                       plane * (nz - 1LL));
    if (nnz > int_max) {
        throw std::invalid_argument(
            "generate_3d_stencil_shuffled: non-zero count does not fit in int");
    }

    const int N = static_cast<int>(total);
    const int plane_stride = static_cast<int>(plane);
    mat.row_map.reserve(static_cast<std::size_t>(N) + 1);
    mat.col_idx.reserve(static_cast<std::size_t>(nnz));
    mat.values.reserve(static_cast<std::size_t>(nnz));

    // SHUFFLE IDs: p maps an original grid index to its shuffled ID.
    std::vector<int> p(N);
    for (int i = 0; i < N; i++) p[i] = i;
    std::mt19937 rng(12345);
    std::shuffle(p.begin(), p.end(), rng);

    // inv_p maps a shuffled ID back to the original grid index.
    std::vector<int> inv_p(N);
    for (int i = 0; i < N; i++) inv_p[p[i]] = i;

    int current_nnz = 0;
    for (int i = 0; i < N; i++) {
        // Recover the grid coordinates of the node that now carries ID i;
        // the original index is x + y*nx + z*nx*ny.
        const int old_u = inv_p[i];
        const int x = old_u % nx;
        const int y = (old_u / nx) % ny;
        const int z = old_u / plane_stride;

        // Gather the shuffled IDs of the in-bounds neighbours. A step along
        // x, y or z changes the original index by 1, nx or nx*ny.
        int neighbors[7];
        int count = 0;
        if (x > 0)      neighbors[count++] = p[old_u - 1];
        if (x < nx - 1) neighbors[count++] = p[old_u + 1];
        if (y > 0)      neighbors[count++] = p[old_u - nx];
        if (y < ny - 1) neighbors[count++] = p[old_u + nx];
        if (z > 0)      neighbors[count++] = p[old_u - plane_stride];
        if (z < nz - 1) neighbors[count++] = p[old_u + plane_stride];
        neighbors[count++] = i;  // diagonal entry: p[old_u] == i

        std::sort(neighbors, neighbors + count);
        for (int k = 0; k < count; k++) {
            mat.col_idx.push_back(neighbors[k]);
            mat.values.push_back(1.0);
        }
        current_nnz += count;
        mat.row_map.push_back(current_nnz);
    }
    mat.num_nnz = current_nnz;
    return mat;
}

// --- GPU BENCHMARK ---
void run_benchmark(int grid_dim) {
    long long n_nodes = (long long)grid_dim * grid_dim * grid_dim;
    printf("Generating %d^3 Shuffled Grid (%lld Nodes)...\\n", grid_dim, n_nodes);
    HostCSR h_mat = generate_3d_stencil_shuffled(grid_dim, grid_dim, grid_dim);
    int N = h_mat.num_rows;
    int NNZ = h_mat.num_nnz;
    printf("Matrix Size: %d Rows, %d NNZ. Moving to Device...\\n", N, NNZ);

    typedef Kokkos::DefaultExecutionSpace::memory_space MemSpace;
    Kokkos::View<int*, MemSpace> row_map("row_map", N+1);
    Kokkos::View<int*, MemSpace> col_idx("col_idx", NNZ);
    Kokkos::View<double*, MemSpace> values("values", NNZ);
    Kokkos::View<double*, MemSpace> x("x", N);
    Kokkos::View<double*, MemSpace> y("y", N);

    // Host Mirrors
    auto h_row = Kokkos::create_mirror_view(row_map);
    auto h_col = Kokkos::create_mirror_view(col_idx);
    auto h_val = Kokkos::create_mirror_view(values);
    
    for(int i=0; i<=N; i++) h_row(i) = h_mat.row_map[i];
    for(int i=0; i<NNZ; i++) h_col(i) = h_mat.col_idx[i];
    for(int i=0; i<NNZ; i++) h_val(i) = h_mat.values[i];

    Kokkos::deep_copy(row_map, h_row);
    Kokkos::deep_copy(col_idx, h_col);
    Kokkos::deep_copy(values, h_val);
    Kokkos::deep_copy(x, 1.0); 

    typedef Kokkos::TeamPolicy<> policy_t;
    typedef policy_t::member_type member_t;
    
    // Warmup
    Kokkos::parallel_for("Warmup", policy_t(N, Kokkos::AUTO), KOKKOS_LAMBDA(const member_t& team) {
        if(team.league_rank() == 0) double temp = 0.0; 
    });
    Kokkos::fence();

    // Measurement
    Kokkos::Timer timer;
    int repeat = 20; 
    for(int iter=0; iter<repeat; iter++) {
        Kokkos::parallel_for("SpMV_GPU", policy_t(N, Kokkos::AUTO), KOKKOS_LAMBDA(const member_t& team) {
            int row = team.league_rank();
            double sum = 0.0;
            int start = row_map(row);
            int len = row_map(row+1) - start;
            
            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, len), 
                [=] (const int k_off, double& lsum) {
                    lsum += values(start + k_off) * x(col_idx(start + k_off));
                }, sum);
            
            if(team.team_rank()==0) y(row) = sum;
        });
    }
    Kokkos::fence();
    
    double avg_time = timer.seconds() / repeat;
    double gflops = (2.0 * NNZ * 1e-9) / avg_time;
    printf(">>> Result: %d^3 | Time: %.5f s | Perf: %.2f GFLOPs\\n\\n", grid_dim, avg_time, gflops);
}

// Entry point: initializes Kokkos, benchmarks each grid size, then finalizes.
int main(int argc, char* argv[]) {
    Kokkos::initialize(argc, argv);
    {
        // Cubic grid edge lengths: 125k, 512k and 1M nodes respectively.
        const int grid_dims[] = {50, 80, 100};
        for (const int dim : grid_dims) {
            run_benchmark(dim);
        }
    }
    Kokkos::finalize();
    return 0;
}
"""
# Save the C++ program to disk so the build step can compile it.
with open("benchmark_gpu.cpp", "w") as cpp_file:
    cpp_file.write(source_code)

In [None]:
# 4. Compile with NVCC Wrapper
print("=== COMPILING ===")
import os
cwd = os.getcwd()
nvcc_wrapper_path = os.path.join(cwd, "kokkos/bin/nvcc_wrapper")
!chmod +x {nvcc_wrapper_path}

# Build Kokkos Library
!mkdir -p build && cd build && \
 cmake ../kokkos \
 -DKokkos_ENABLE_CUDA=ON \
 -DKokkos_ENABLE_CUDA_LAMBDA=ON \
 -DCMAKE_CXX_COMPILER={nvcc_wrapper_path} \
 -DCMAKE_BUILD_TYPE=Release && \
 make -j2

# Build Benchmark App
with open("CMakeLists.txt", "w") as f:
    f.write("""
cmake_minimum_required(VERSION 3.16)
project(BenchGL CXX)
find_package(Kokkos REQUIRED PATHS build)
add_executable(bench benchmark_gpu.cpp)
target_link_libraries(bench Kokkos::kokkos)
""")

print("=== LINKING ===")
!cmake . -DCMAKE_CXX_COMPILER={nvcc_wrapper_path} -DKokkos_DIR=/content/build
!make bench

In [None]:
# 5. Execute
# Runs the `bench` executable from the current directory (built by `make bench`).
print("=== RUNNING BENCHMARK ===")
!./bench