# Softmax (1D)



In [1]:
import sys, os
from pathlib import Path

# Add the parent directory of the current notebook to sys.path
cur_dir = Path().resolve()
parent_dir = cur_dir.parent
sys.path += [str(parent_dir), str(cur_dir)]


os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
import torch
from utils import cdiv, get_sig, load_cuda, profile_kernel
from collections import namedtuple
from softmax_py import  softmax_py

## Python cuda looking implementation

In [3]:

V = torch.randn(32, dtype=torch.float32)
O = softmax_py(V)
O_torch = torch.softmax(V, dim=0)
torch.allclose(O, O_torch, atol=1e-4)


True

# Cuda 
## naive and unsafe

In [4]:
def get_modules(kernels):
    for kernel_name, kernel_data in kernels.items():

        fname = kernel_data["fname"]
        cuda_source = Path(kernel_data["cuda_source_path"]).read_text()
        cpp_source = get_sig(fname, cuda_source)
        module = load_cuda(cuda_source, cpp_source, funcs=[fname])
        kernel_data["module"] = module


def get_softmax_modules(kernels):
    get_modules(kernels)
    kernels["torch"] = {
        "module": torch,
        "fname": "softmax",
    }

def add_args_kwargs(kernels, *args, **kwargs):
    for kernel_name, kernel_data in kernels.items():
        kernel_data["args"]= args
        if kernel_name == "torch":
            kernel_data["kwargs"] = kwargs
        else: kernel_data["kwargs"] = {}


In [5]:
kernels = {
    "softmax_naive": dict(cuda_source_path = "./softmax_naive.cu", fname = "softmax_naive"),
    "softmax_fast": dict(cuda_source_path = "./softmax_fast.cu", fname = "softmax"),
    "softmax_tiled": dict(cuda_source_path = "./softmax_tiled.cu", fname = "softmax_tiled"),
}
get_softmax_modules(kernels)
add_args_kwargs(kernels, V, dim=0)


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


## Profile

In [None]:
def test_allclose(kernels):
    D = 1024
    V_test = torch.randn(D).contiguous().cuda()
    O_torch = torch.softmax(V_test,  dim=0)
    for kernel_name, kernel_data in kernels.items():
        if kernel_name!="torch":
            module, fname = kernel_data["module"], kernel_data["fname"]
            O = getattr(module, fname)(V_test)
            if not torch.allclose(O, O_torch, atol=1e-4):
                raise ValueError(f"{kernel_name=} failed:\n\n {O[:10]=}, {O_torch[:10]=}")
            print(f"{kernel_name=} agrees with torch softmax")
        


def profile_kernels(kernels):
    test_allclose(kernels)
    for kernel_name, kernel_data in kernels.items():
        print(f"Profiling: {kernel_name}")
        profile_kernel(kernel_data["module"], kernel_data["fname"], *kernel_data["args"], **kernel_data["kwargs"])

profile_kernels(kernels)


kernel_name='softmax_naive' agrees with torch softmax
kernel_name='softmax_fast' agrees with torch softmax


ValueError: kernel_name='softmax_tiled' failed:

 O[:10]=tensor([4.9065e-04, 1.4244e-04, 1.1706e-04, 1.3123e-04, 9.6207e-04, 1.3519e-04,
        4.4394e-04, 1.2288e-04, 3.3757e-05, 2.5842e-05], device='cuda:0'), O_torch[:10]=tensor([0.0022, 0.0006, 0.0005, 0.0006, 0.0042, 0.0006, 0.0020, 0.0005, 0.0001,
        0.0001], device='cuda:0')

In [None]:
torch.randn(1024).contiguous().cuda()

RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
D = 1024
V = torch.randn(D).contiguous().cuda()
O = getattr(module, fname)(V)
O_torch = torch.softmax(V,  dim=0)
assert torch.allclose(O, O_torch, atol=1e-4) 

NameError: name 'module' is not defined

In [None]:
O = getattr(module, fname)(V)
O_torch = torch.softmax(V,  dim=0)

In [None]:
%%timeit -n 20
getattr(module, fname)(V)

51.5 μs ± 3.08 μs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [None]:
%%timeit -n 20
torch.softmax(V,  dim=0)

21.3 μs ± 10.1 μs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [None]:
profile_kernel(torch, "softmax", V,  dim=0)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          aten::softmax         0.55%       9.480us        99.79%       1.735ms       1.735ms       0.000us         0.00%       9.280us       9.280us             1  
                                         aten::_softmax         3.62%      62.900us        99.25%       1.726ms       1.726ms       4.640us       100.00%       9.280us       9.280us             1  
         

In [None]:
profile_kernel(module, "softmax", V)

---------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
    softmax_kernel(float*, float*, int)         0.00%       0.000us         0.00%       0.000us       0.000us      24.224us       100.00%      24.224us      24.224us             1  
                       aten::empty_like         1.33%      28.090us        95.56%       2.013ms       2.013ms       0.000us         0.00%       0.000us       0.000us             1  
                    aten::empty_strided         3.56%      74.961us        94.23%       1

In [None]:
cuda_source_path = "./softmax.cu"
fname = "softmax_tiled"
cuda_source = Path(cuda_source_path).read_text()
cpp_source = get_sig(fname, cuda_source)
module = load_cuda(cuda_source, cpp_source, funcs=[fname])

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


In [None]:
D = 2048**2
V = torch.randn(D).contiguous().cuda()
O = getattr(module, fname)(V)
O_torch = torch.softmax(V,  dim=0)
assert torch.allclose(O, O_torch, atol=1e-4) 

In [None]:
%%timeit -n 20
getattr(module, fname)(V)

In [None]:
%%timeit -n 20
torch.softmax(V,  dim=0)

The slowest run took 4.87 times longer than the fastest. This could mean that an intermediate result is being cached.
15.3 μs ± 9.77 μs per loop (mean ± std. dev. of 7 runs, 20 loops each)


# TODO
setup the profiling to get info on dram etc (lesson 8?)