# Softmax

## Python implementation written in CUDA style

In [1]:
import sys, os
from pathlib import Path

# Make project-local modules importable: add the current working directory
# (`Path().resolve()`, normally the folder the notebook runs from) and its
# parent to sys.path.
cur_dir = Path().resolve()
parent_dir = cur_dir.parent
# Only append entries that are not already present, so re-running this cell
# does not keep growing sys.path with duplicates.
sys.path += [p for p in (str(parent_dir), str(cur_dir)) if p not in sys.path]
from utils import cdiv, get_sig, load_cuda
from collections import namedtuple
import torch
from softmax_py import  softmax_py

# Debug aid: request blocking (synchronous) CUDA kernel launches.
# NOTE(review): this also affects every timing cell below - confirm that is intended.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:



# Run softmax_py (imported from the local softmax_py module) on a
# 32-element float32 vector.
V = torch.randn(32, dtype=torch.float32)
O = softmax_py(V)

# Reference result from PyTorch; the comparison below is the cell's last
# expression, so its boolean result is displayed as the cell output.
O_torch = V.softmax(dim=0)
torch.allclose(O, O_torch, atol=1e-4)


True

# CUDA
## naive and unsafe

In [3]:
# Read softmax_fast.cu and load its `softmax` function through the local
# utils helpers. `fname` and `module` are reused by the cells that follow.
cuda_source_path, fname = "./softmax_fast.cu", "softmax"
cuda_source = Path(cuda_source_path).read_text()
# presumably get_sig derives the C++ declaration for `fname` from the CUDA
# source - confirm in utils.
cpp_source = get_sig(fname, cuda_source)
module = load_cuda(
    cuda_source,
    cpp_source,
    funcs=[fname],
)

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


In [4]:
# Compare the loaded kernel with torch.softmax on a D-element CUDA vector.
D = 1024
V = torch.randn(D).contiguous().cuda()
O = getattr(module, fname)(V)
O_torch = torch.softmax(V,  dim=0)
# Softmax outputs sum to 1, so a typical entry is about 1/D (~1e-3 here).
# An absolute tolerance of 1e-4 would therefore accept errors of roughly 10%
# on typical entries; compare relatively instead, and report the worst
# mismatch when the check fails (the message is only evaluated on failure).
assert torch.allclose(O, O_torch, rtol=1e-3, atol=1e-9), (
    f"softmax mismatch: max abs diff {(O - O_torch).abs().max().item():.3e}"
)

In [5]:
# Recompute the PyTorch reference and the custom result on the same V
# (presumably a warm-up before the timing cells below - confirm).
O_torch = V.softmax(dim=0)
O = getattr(module, fname)(V)

In [6]:
%%timeit -n 20
getattr(module, fname)(V)

55.4 μs ± 2.78 μs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [7]:
%%timeit -n 20
torch.softmax(V,  dim=0)

22 μs ± 11.2 μs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [10]:
from torch.profiler import profile, ProfilerActivity

# Profile one call of the currently loaded kernel, recording both CPU and
# CUDA activity, then print the ops sorted by total CUDA time.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             with_stack=True, record_shapes=True) as prof:
    # Look the op up through `fname`, as the other cells do, so this cell
    # keeps working when `module` is rebound to a build that exports a
    # different function name.
    getattr(module, fname)(V)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))

---------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
    softmax_kernel(float*, float*, int)         0.00%       0.000us         0.00%       0.000us       0.000us      24.320us       100.00%      24.320us      24.320us             1  
                       aten::empty_like        45.28%       1.288ms        96.94%       2.758ms       2.758ms       0.000us         0.00%       0.000us       0.000us             1  
                    aten::empty_strided         1.94%      55.231us        51.66%       1

In [8]:
# Rebind `fname` and `module` to the `softmax_tiled` function from softmax.cu;
# the cells that follow call whatever `fname` names on `module`.
fname = "softmax_tiled"
cuda_source_path = "./softmax.cu"
cuda_source = Path(cuda_source_path).read_text()
cpp_source = get_sig(fname, cuda_source)
module = load_cuda(
    cuda_source,
    cpp_source,
    funcs=[fname],
)

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


In [9]:
# Compare the loaded kernel with torch.softmax on a large (2048*2048-element)
# CUDA vector.
D = 2048**2
V = torch.randn(D).contiguous().cuda()
O = getattr(module, fname)(V)
O_torch = torch.softmax(V,  dim=0)
# Softmax outputs sum to 1, so a typical entry is about 1/D (~2.4e-7 here),
# hundreds of times smaller than an absolute tolerance of 1e-4. With that
# tolerance the check could not detect even grossly wrong values, so compare
# relatively instead and report the worst mismatch on failure (the message is
# only evaluated when the assertion fails).
assert torch.allclose(O, O_torch, rtol=1e-3, atol=1e-9), (
    f"softmax mismatch: max abs diff {(O - O_torch).abs().max().item():.3e}"
)

In [None]:
%%timeit -n 20
getattr(module, fname)(V)

32.1 μs ± 2.26 μs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [None]:
%%timeit -n 20
torch.softmax(V,  dim=0)

The slowest run took 4.87 times longer than the fastest. This could mean that an intermediate result is being cached.
15.3 μs ± 9.77 μs per loop (mean ± std. dev. of 7 runs, 20 loops each)


# TODO
Set up profiling to collect DRAM-level information and related metrics (see lesson 8?).