# Softmax (1D)



In [1]:
import sys, os
from pathlib import Path

# Make the notebook's working directory and its parent importable, so the
# local imports in the next cell (presumably `utils` and `softmax_py` live
# in one of these two directories) can be resolved.
cur_dir = Path().resolve()
parent_dir = cur_dir.parent
for _path in (str(parent_dir), str(cur_dir)):
    # Only append missing entries: re-running this cell must not keep
    # growing sys.path with duplicates.
    if _path not in sys.path:
        sys.path.append(_path)

# Set before torch is imported in the next cell. Per the PyTorch CUDA notes
# this makes kernel launches synchronous (a debugging aid); it may also
# influence the timings measured further down.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
import torch
from utils import cdiv, get_sig, load_cuda, profile_kernel
from collections import namedtuple
from softmax_py import  softmax_py

def test_allclose(kernels):
    D = 1024
    V_test = torch.randn(D).contiguous().cuda()
    O_torch = torch.softmax(V_test,  dim=0)
    for kernel_name, kernel_data in kernels.items():
        if kernel_name!="torch":
            module, fname = kernel_data["module"], kernel_data["fname"]
            O = getattr(module, fname)(V_test)
            if not torch.allclose(O, O_torch, atol=1e-4):
                raise ValueError(f"{kernel_name=} failed:\n\n {O[:10]=}, {O_torch[:10]=}")
            print(f"{kernel_name=} agrees with torch softmax")
        


def profile_kernels(kernels):
    """Validate the kernels, then profile each registered entry in order.

    ``test_allclose`` runs first, so a kernel that disagrees with torch raises
    before any profiling starts. Every entry must carry ``"module"``,
    ``"fname"``, ``"args"`` and ``"kwargs"``.
    """
    test_allclose(kernels)
    for name in kernels:
        entry = kernels[name]
        print(f"Profiling: {name}")
        module, fname = entry["module"], entry["fname"]
        profile_kernel(module, fname, *entry["args"], **entry["kwargs"])


## CUDA-style implementation in Python

In [3]:
# Compare `softmax_py` with `torch.softmax` on a small 32-element float32 CUDA tensor.
V_small = torch.randn(32, dtype=torch.float32).contiguous().cuda()
O = softmax_py(V_small)
O_torch = torch.softmax(V_small, dim=0)
# Last expression of the cell: displays whether both results agree within atol=1e-4.
torch.allclose(O, O_torch, atol=1e-4)



True

## CUDA

In [4]:
def get_modules(kernels):
    """Build the module for every CUDA-backed entry of ``kernels``, in place.

    For each entry that has a ``"cuda_source_path"``, the source file is read,
    passed with ``fname`` to ``get_sig`` to obtain the C++ source, and both are
    handed to ``load_cuda``; the result is stored under ``"module"``.

    Entries without a ``"cuda_source_path"`` (such as the ``"torch"`` reference
    added by ``get_softmax_modules``) are skipped, so calling this again on an
    already-populated dict no longer raises ``KeyError``.

    Args:
        kernels: Mapping of kernel name to a dict with ``"fname"`` and,
            for CUDA kernels, ``"cuda_source_path"``.
    """
    for kernel_data in kernels.values():
        if "cuda_source_path" not in kernel_data:
            # Nothing to compile for this entry (e.g. the torch baseline).
            continue
        fname = kernel_data["fname"]
        cuda_source = Path(kernel_data["cuda_source_path"]).read_text()
        cpp_source = get_sig(fname, cuda_source)
        module = load_cuda(cuda_source, cpp_source, funcs=[fname])
        kernel_data["module"] = module


def get_softmax_modules(kernels):
    """Load all kernel modules, then register ``torch.softmax`` as the baseline.

    The baseline is stored under the key ``"torch"`` with ``torch`` itself as
    the module and ``"softmax"`` as the function name.
    """
    get_modules(kernels)
    reference = dict(module=torch, fname="softmax")
    kernels["torch"] = reference

def add_args_kwargs(kernels, *args, **kwargs):
    """Attach call arguments to every kernel entry, in place.

    All entries receive the same positional ``args``. Only the ``"torch"``
    entry receives ``kwargs``; every other entry gets an empty dict.
    """
    for name, entry in kernels.items():
        entry["args"] = args
        entry["kwargs"] = kwargs if name == "torch" else {}


In [5]:
# Input size and the float32 CUDA vector handed to every kernel.
D = 1024
V = torch.randn(D, dtype=torch.float32).contiguous().cuda()

# Each custom kernel is registered under its own name, with its source at
# ./<name>.cu and an entry function of the same name.
kernels = {
    name: {"cuda_source_path": f"./{name}.cu", "fname": name}
    for name in ("softmax_naive", "softmax_fast", "softmax_tiled")
}
get_softmax_modules(kernels)
add_args_kwargs(kernels, V, dim=0)


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


## Profile

In [6]:
# Check each custom kernel against torch.softmax, then profile every entry (torch included).
profile_kernels(kernels)


kernel_name='softmax_naive' agrees with torch softmax
kernel_name='softmax_fast' agrees with torch softmax
kernel_name='softmax_tiled' agrees with torch softmax
Profiling: softmax_naive
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
              softmax_naive_kernel(float*, float*, int)         0.00%       0.000us         0.00%       0.000us       0.000us      36.320us        96.43%      36.320us      36.320us             1  
                     

## Appendix

In [7]:
# O = getattr(module, fname)(V)
# O_torch = torch.softmax(V,  dim=0)

In [8]:
# %%timeit -n 20
# getattr(module, fname)(V)

In [9]:
%%timeit -n 20
# Baseline wall-clock timing of torch.softmax on V. CUDA_LAUNCH_BLOCKING=1 is set in
# the first cell; presumably that keeps the GPU work inside the timed region — TODO confirm.
torch.softmax(V,  dim=0)

26.7 μs ± 9.2 μs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [10]:
# profile_kernel(torch, "softmax", V,  dim=0)

In [11]:
# profile_kernel(module, "softmax", V)

In [12]:
# cuda_source_path = "./softmax.cu"
# fname = "softmax_tiled"
# cuda_source = Path(cuda_source_path).read_text()
# cpp_source = get_sig(fname, cuda_source)
# module = load_cuda(cuda_source, cpp_source, funcs=[fname])

In [13]:
# D = 2048**2
# V = torch.randn(D).contiguous().cuda()
# O = getattr(module, fname)(V)
# O_torch = torch.softmax(V,  dim=0)
# assert torch.allclose(O, O_torch, atol=1e-4) 

In [14]:
# %%timeit -n 20
# getattr(module, fname)(V)

In [15]:
# %%timeit -n 20
# torch.softmax(V,  dim=0)

# TODO
Set up the profiling to get info on DRAM, warps, etc. (lesson 8?)