In [1]:
! pip install ninja
! apt update
! apt install g++-11 -y
# 编译缓存的包，加速编译
! apt install ccache -y

Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease                         [0m[33m
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]      [0m
Get:1 https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]       [0mm[33m
Get:5 https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64  Packages [1607 kB]
Hit:6 https://ppa.launchpadcontent.net/xmake-io/xmake/ubuntu jammy InRelease   [0m
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]      [0m[33m[33m[33m[33m[33m[33m
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1544 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3155 kB]
Fetched 6692 kB in 4s (1643 kB/s)  [0m                  [0m[33m[33m[33m
Reading package lists... Done
Building dependency tree... Done
Reading state information... D

In [2]:
import torch
import torch.utils.cpp_extension
import os
# Point the host C and C++ compilers at the GCC 11 entries under /usr/lib/ccache.
os.environ.update({
    'CXX': '/usr/lib/ccache/g++-11',
    'CC': '/usr/lib/ccache/gcc-11',
})

In [3]:
# based on Jeremy's Lecture 3 notebook
# Shared C++/CUDA preamble: headers, tensor-argument checks and a
# ceiling-division helper.
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// Wrapped in do { } while (0) so that both checks behave as one statement,
// for example when used under an unbraced `if`.
#define CHECK_INPUT(x) do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); } while (0)

// Ceiling division: the number of chunks of size b needed to cover a.
inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
'''

# Grayscale kernel plus its two host entry points, appended to the shared
# preamble held in cuda_begin.
cuda_src = cuda_begin + r'''
// One thread per output element: out[i] is a weighted sum of in[i], in[i+n]
// and in[i+2*n], i.e. `in` is read as three consecutive planes of n bytes.
__global__ void rgb_to_grayscale_kernel(unsigned char* out, unsigned char* in, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= n) return;  // threads past the end of the data do nothing
    out[i] = 0.2989f*in[i] + 0.5870f*in[i+n] + 0.1140f*in[i+2*n];  // fix with f found by Andreas...
}

// Writes the grayscale version of `input` (uint8, at least 3 planes of H x W)
// into the caller-provided `output` (uint8, H x W) and returns `output`.
// Raises through TORCH_CHECK when either tensor is not a contiguous CUDA
// tensor or when shape, device or dtype do not line up.
torch::Tensor rgb_to_grayscale_out(torch::Tensor output, const torch::Tensor& input) {
    CHECK_INPUT(input);
    CHECK_INPUT(output);  // the kernel writes through a raw pointer with flat indexing
    // The kernel reads planes 0, 1 and 2, so fewer than three would read out of bounds.
    TORCH_CHECK(input.dim() == 3 && input.size(0) >= 3,
                "input must have shape (C, H, W) with C >= 3");
    int h = input.size(1);
    int w = input.size(2);
    // All of the following must hold before raw pointers are handed to the kernel.
    TORCH_CHECK(output.dim() == 2 && h == output.size(0) && w == output.size(1),
                "output must have shape (H, W) matching input");
    TORCH_CHECK(output.device() == input.device(),
                "output and input must be on the same device");
    TORCH_CHECK(input.scalar_type() == torch::kUInt8 && output.scalar_type() == input.scalar_type(),
                "input and output must both be uint8 tensors");
    int n = w*h;
    if (n == 0) return output;  // a grid with zero blocks is not a valid launch configuration
    int threads = 256;
    rgb_to_grayscale_kernel<<<cdiv(n,threads), threads>>>(
        output.data_ptr<unsigned char>(), input.data_ptr<unsigned char>(), n);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}

// Allocates an (H, W) tensor with the same options as `input`, fills it via
// rgb_to_grayscale_out and returns it.
torch::Tensor rgb_to_grayscale(const torch::Tensor& input) {
    CHECK_INPUT(input);
    // Validate the rank before size(1) / size(2) are read.
    TORCH_CHECK(input.dim() == 3 && input.size(0) >= 3,
                "input must have shape (C, H, W) with C >= 3");
    int h = input.size(1);
    int w = input.size(2);
    auto output = torch::empty({h,w}, input.options());
    rgb_to_grayscale_out(output, input);
    return output;
}
'''

# C++ declarations of the two functions defined in the CUDA source; these are
# the names exported through the `functions=` argument of load_inline.
cpp_src = """
torch::Tensor rgb_to_grayscale(const torch::Tensor& input);
torch::Tensor rgb_to_grayscale_out(torch::Tensor output, const torch::Tensor& input);
"""

import os
# Point the host C and C++ compilers at the GCC 11 entries under /usr/lib/ccache.
os.environ.update({
    'CXX': '/usr/lib/ccache/g++-11',
    'CC': '/usr/lib/ccache/gcc-11',
})

# Build the extension and import it; the two listed functions become
# attributes of `module`. verbose=True prints the build log, and the extra
# nvcc flag asks ptxas for verbose output.
module = torch.utils.cpp_extension.load_inline(
    name="test_ext",
    cpp_sources=cpp_src,
    cuda_sources=cuda_src,
    functions=['rgb_to_grayscale', 'rgb_to_grayscale_out'],
    extra_cuda_cflags=['--ptxas-options=-v'],
    verbose=True,
)



# Random uint8 test tensor of shape (3, n, n) on the GPU.
n = 2048
t = torch.randint(low=0, high=256, size=(3, n, n), dtype=torch.uint8, device="cuda")

# The first call allocates the output buffer that the timed runs reuse;
# synchronize so that it has completed before timing starts.
out = module.rgb_to_grayscale(t)
torch.cuda.synchronize()

import time
# Wall-clock time for 10,000 back-to-back calls into the preallocated buffer.
reps = 10_000
t0 = time.perf_counter_ns()
for i in range(reps):
    module.rgb_to_grayscale_out(out, t)
# Wait for all queued GPU work to finish before stopping the clock.
torch.cuda.synchronize()
t1 = time.perf_counter_ns()

# Average nanoseconds per call, reported in microseconds.
ns_per_call = (t1 - t0) / reps
print(ns_per_call / 1_000, "µs")


# Profile the same call 10,000 times, synchronizing after each one, then
# print the per-operation averages collected by the profiler.
with torch.profiler.profile() as prof:
    for i in range(10_000):
        module.rgb_to_grayscale_out(out, t)
        torch.cuda.synchronize()

op_stats = prof.key_averages()
print(op_stats.table())


Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py311_cu124/test_ext...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/test_ext/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module test_ext...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] /usr/lib/ccache/g++-11 -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=test_ext -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /opt/conda/lib/python3.11/site-packages/torch/include -isystem /opt/conda/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.11/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.11/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /root/.cache/torch_extensions/py311_cu124/test_ext/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -ccbin /usr/lib/ccache/gcc-11 -DTORCH_EXTENSION_NAME=test_ext -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /opt

Loading extension module test_ext...


83.47746339999999 µs
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       cudaLaunchKernel        10.43%      83.812ms        10.43%      83.812ms       8.381us       0.000us         0.00%       0.000us       0.000us         10000  
rgb_to_grayscale_kernel(unsigned char*, unsigned cha...         0.00%       0.000us         0.00%       0.000us       0.000us     878.432ms       100.00%     878.432ms      87.843us     

Bad pipe message: %s [b' Intel Mac OS X 10.15; rv:137.0) Gecko/20100101 Firefox/137.0\r\nAccept: text/html,application/xhtml+xml,', b'plication/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0']
Bad pipe message: %s [b',en;q=0.2\r\nAccept-Encoding: gzip, deflate, br', b'zstd\r\nConnection: keep-alive\r\nUpgrade-Insec']
Bad pipe message: %s [b'e-Requests: 1\r\nSec-Fetch-Dest: document\r\nSec-Fetch-Mode: navigate\r\nSec-Fetch-Site: none\r\nSec-Fetch-User: ?1\r\nPriorit']
Bad pipe message: %s [b' Intel Mac OS X 10.15; rv:137.0) Gecko/20100101 Firefox/137.0\r\nAccept: text/html,application/xhtml+xml,', b'plication/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0']
Bad pipe message: %s [b',en;q=0.2\r\nAccept-Encoding: gzip, deflate, br', b'zstd\r\nConnection: keep-alive\r\nUpgrade-Insec']
Bad pipe message: %s [b'e-Requests: 1\r\nSec-Fetch-Dest: document\r\nSec-Fetch-Mode: navigate\r\nSec-Fetch-Site: none\r\nSec-