In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import pandas as pd

# Mounting onto Google Colab
from google.colab import files 
files.download('examples.txt')

from google.colab import drive 
drive.mount('/content/drive')



In [2]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F

"""
GELU with adjustable parameter a
SiLU with adjustable parameter a
ZiLU with adjustable parameter s
"""
class GELU_s(nn.Module):
    def __init__(self, sigma, inplace=False, max_val=1000):
        super(GELU_s, self).__init__()

        self.sigma = sigma
        self.max_val = max_val
        self.kAlpha = 0.70710678118654752440
        self.relu = nn.ReLU(inplace=inplace) 

    def forward(self, x):
        if self.sigma >= self.max_val:
            return self.relu(x) 
        else: 
            return x * 0.5 * (1 + torch.erf(self.sigma * x * self.kAlpha))

class SiLU_s(nn.Module):
    def __init__(self, sigma, inplace=False, max_val=1000):
        super(SiLU_s, self).__init__()
        self.sigma = sigma
        self.max_val = max_val
        self.relu = nn.ReLU(inplace=inplace)
        
    def forward(self, x):
        if self.sigma >= self.max_val:
            return self.relu(x)
        else:
            return x * torch.sigmoid(self.sigma * x)

class ZiLU_Old(nn.Module):
    def __init__(self, sigma, inplace=False, max_val=1000):
        super(ZiLU_Old, self).__init__()

        self.sigma = sigma
        self.max_val = max_val
        self.relu = nn.ReLU(inplace=inplace)

    def forward(self, x):
        if self.sigma >= self.max_val:
            return self.relu(x)
        else:
            return x * (2 * (1/4 + 1/(2 * torch.pi) * torch.arctan(self.sigma * x)))

"""
arctan 
arctan approximation 
ZiLU
ZiLU approximation
"""

class ArcTan(nn.Module):
    def __init__(self, sigma=None):
        super(ArcTan, self).__init__()
        if sigma: 
            self.sigma = sigma
        else: 
            self.sigma = nn.Parameter(torch.tensor(5.0))

    def forward(self, x):
        return 0.5 + (1.0 / torch.pi) * torch.arctan(self.sigma * x)

class ArcTan_Approx(nn.Module):
    def __init__(self, sigma=None):
        super(ArcTan_Approx, self).__init__()
        if sigma: 
            self.sigma = sigma
        else: 
            self.sigma = nn.Parameter(torch.tensor(5.0))

    def forward(self, x): 
        z = self.sigma * x 
        return (0.5 + torch.clamp(z, min=0)) / (1.0 + torch.abs(z))

class ZiLU(nn.Module):
    def __init__(self, sigma=None):
        super(ZiLU, self).__init__()
        self.arctan = ArcTan(sigma)
        
    def forward(self, x):
        return x * self.arctan(x)

class ZiLU_Approx(nn.Module):
    def __init__(self, sigma=None):
        super(ZiLU_Approx, self).__init__()
        self.arctan_approx = ArcTan_Approx(sigma)
        
    def forward(self, x):
        return x * self.arctan_approx(x)




In [3]:


def benchmark_activation(activation_fn, input_tensor, device, num_warmup=10, num_iterations=100, compile=True):
    """
    Benchmark forward and backward pass times for an activation function.
    
    Args:
        activation_fn: The activation function module
        input_tensor: Input tensor for testing
        device: Device to run on ('cpu', 'mps', 'cuda')
        num_warmup: Number of warmup iterations
        num_iterations: Number of timed iterations
    
    Returns:
        dict with forward_time and backward_time in milliseconds
    """
    activation_fn = activation_fn.to(device)
    input_tensor = input_tensor.to(device)

    if compile: 
        activation_fn = torch.compile(activation_fn)
        
    # Warmup
    for _ in range(num_warmup):
        output = activation_fn(input_tensor)
        if input_tensor.requires_grad:
            output.sum().backward()
            input_tensor.grad = None
    
    # Synchronize device
    if device == 'cuda':
        torch.cuda.synchronize()
    elif device == 'mps':
        torch.mps.synchronize()
    
    # Benchmark forward pass
    forward_times = []
    for _ in range(num_iterations):
        if device == 'cuda':
            torch.cuda.synchronize()
            start = time.perf_counter()
        elif device == 'mps':
            torch.mps.synchronize()
            start = time.perf_counter()
        else:
            start = time.perf_counter()
        
        output = activation_fn(input_tensor)
        
        if device == 'cuda':
            torch.cuda.synchronize()
        elif device == 'mps':
            torch.mps.synchronize()
        
        end = time.perf_counter()
        forward_times.append((end - start) * 1000)  # Convert to ms
    
    # Benchmark backward pass
    backward_times = []
    for _ in range(num_iterations):
        output = activation_fn(input_tensor)
        
        if device == 'cuda':
            torch.cuda.synchronize()
            start = time.perf_counter()
        elif device == 'mps':
            torch.mps.synchronize()
            start = time.perf_counter()
        else:
            start = time.perf_counter()
        
        output.sum().backward()
        
        if device == 'cuda':
            torch.cuda.synchronize()
        elif device == 'mps':
            torch.mps.synchronize()
        
        end = time.perf_counter()
        backward_times.append((end - start) * 1000)  # Convert to ms
        
        input_tensor.grad = None
    
    return {
        'forward_mean': sum(forward_times) / len(forward_times),
        'forward_std': torch.tensor(forward_times).std().item(),
        'backward_mean': sum(backward_times) / len(backward_times),
        'backward_std': torch.tensor(backward_times).std().item()
    }

def run_benchmarks():
    """Run benchmarks for all activation functions on all available devices."""
    
    # Test configuration
    batch_size = 64
    input_size = 1024
    sigma = 5.0
    
    # Create test input
    input_tensor = torch.randn(batch_size, input_size, requires_grad=True)
    
    # Activation functions to test
    activations = {
        "ReLU": nn.ReLU(),
        "SiLU": nn.SiLU(),
        "GELU": nn.GELU(),
        "Sigmoid": nn.Sigmoid(),
        "LeakyReLU": nn.LeakyReLU(),
        "PReLU": nn.PReLU(),
        "ELU": nn.ELU(),
        "Hardshrink": nn.Hardshrink(),
        "Softshrink": nn.Softshrink(),
        "Tanhshrink": nn.Tanhshrink(),
        "Hardtanh": nn.Hardtanh(),
        "Softplus": nn.Softplus(),
        "Softsign": nn.Softsign(),
        "Tanh": nn.Tanh(),
        "CELU": nn.CELU(),
        "Swish": nn.SiLU(),  # Swish is equivalent to SiLU
        "Mish": nn.Mish(),
        "HardSwish": nn.Hardswish(),
        "HardSigmoid": nn.Hardsigmoid(),
        "GELU_s": GELU_s(sigma=sigma),
        "SiLU_s": SiLU_s(sigma=sigma),
        "ZiLU_Old": ZiLU_Old(sigma=sigma),
        "ArcTan": ArcTan(sigma=sigma),
        "ArcTan_Approx": ArcTan_Approx(sigma=sigma),
        "ZiLU": ZiLU(sigma=sigma),
        "ZiLU_Approx": ZiLU_Approx(sigma=sigma)
    }
    
    # Determine available devices
    devices = ['cpu']
    if torch.cuda.is_available():
        devices.append('cuda')
    if torch.backends.mps.is_available():
        devices.append('mps')
    
    print(f"Available devices: {devices}")
    print(f"Input shape: {input_tensor.shape}\n")
    
    # Run benchmarks
    results = {}
    for device in devices:
        print(f"\n{'='*60}")
        print(f"Device: {device.upper()}")
        print(f"{'='*60}")
        
        results[device] = {}
        
        for name, activation_fn in activations.items():
            print(f"\nBenchmarking {name}...")
            
            try:
                # Create fresh input for each test
                test_input = input_tensor.clone().detach().requires_grad_(True)
                
                # Run benchmark
                timing = benchmark_activation(
                    activation_fn, 
                    test_input, 
                    device,
                    num_warmup=10,
                    num_iterations=100
                )
                
                results[device][name] = timing
                
                print(f"  Forward:  {timing['forward_mean']:.4f} ± {timing['forward_std']:.4f} ms")
                print(f"  Backward: {timing['backward_mean']:.4f} ± {timing['backward_std']:.4f} ms")
                
            except Exception as e:
                print(f"  Error: {e}")
                results[device][name] = None
    
    # Print summary table
    print(f"\n{'='*60}")
    print("SUMMARY TABLE")
    print(f"{'='*60}")
    
    for device in devices:
        print(f"\n{device.upper()}:")
        print(f"{'Activation':<20} {'Forward (ms)':<20} {'Backward (ms)':<20}")
        print("-" * 60)
        
        for name in activations.keys():
            if results[device].get(name):
                timing = results[device][name]
                fwd = f"{timing['forward_mean']:.4f} ± {timing['forward_std']:.4f}"
                bwd = f"{timing['backward_mean']:.4f} ± {timing['backward_std']:.4f}"
                print(f"{name:<20} {fwd:<20} {bwd:<20}")
            else:
                print(f"{name:<20} {'N/A':<20} {'N/A':<20}")
    
    return results

if __name__ == "__main__":
    results = run_benchmarks()

Available devices: ['cpu', 'mps']
Input shape: torch.Size([64, 1024])


Device: CPU

Benchmarking ReLU...
  Forward:  0.0549 ± 0.0182 ms
  Backward: 0.0804 ± 0.0415 ms

Benchmarking SiLU...
  Forward:  0.0509 ± 0.0490 ms
  Backward: 0.0918 ± 0.0156 ms

Benchmarking GELU...
  Forward:  0.0647 ± 0.0416 ms
  Backward: 0.1205 ± 0.0118 ms

Benchmarking Sigmoid...
  Forward:  0.0532 ± 0.0330 ms
  Backward: 0.0828 ± 0.0074 ms

Benchmarking LeakyReLU...


W1228 14:28:40.910000 24020 site-packages/torch/_dynamo/convert_frame.py:1016] [0/8] torch._dynamo hit config.recompile_limit (8)
W1228 14:28:40.910000 24020 site-packages/torch/_dynamo/convert_frame.py:1016] [0/8]    function: 'inner' (/Users/mingikang/miniconda3/envs/torch/lib/python3.11/site-packages/torch/_dynamo/external_utils.py:68)
W1228 14:28:40.910000 24020 site-packages/torch/_dynamo/convert_frame.py:1016] [0/8]    last reason: 0/7: ___check_type_id(fn, 4408782496)                       
W1228 14:28:40.910000 24020 site-packages/torch/_dynamo/convert_frame.py:1016] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W1228 14:28:40.910000 24020 site-packages/torch/_dynamo/convert_frame.py:1016] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.


  Forward:  0.0700 ± 0.0505 ms
  Backward: 0.0947 ± 0.0428 ms

Benchmarking PReLU...
  Forward:  0.0406 ± 0.0370 ms
  Backward: 0.0833 ± 0.0109 ms

Benchmarking ELU...
  Forward:  0.0472 ± 0.0346 ms
  Backward: 0.0939 ± 0.0113 ms

Benchmarking Hardshrink...
  Forward:  0.0562 ± 0.0386 ms
  Backward: 0.0850 ± 0.0278 ms

Benchmarking Softshrink...
  Forward:  0.0240 ± 0.0145 ms
  Backward: 0.0327 ± 0.0036 ms

Benchmarking Tanhshrink...
  Forward:  0.0878 ± 0.0376 ms
  Backward: 0.0868 ± 0.0350 ms

Benchmarking Hardtanh...
  Forward:  0.0297 ± 0.0447 ms
  Backward: 0.0322 ± 0.0049 ms

Benchmarking Softplus...
  Forward:  0.1119 ± 0.0338 ms
  Backward: 0.0726 ± 0.0052 ms

Benchmarking Softsign...
  Forward:  0.0696 ± 0.0447 ms
  Backward: 0.1669 ± 0.0452 ms

Benchmarking Tanh...
  Forward:  0.0646 ± 0.0198 ms
  Backward: 0.0421 ± 0.0037 ms

Benchmarking CELU...
  Forward:  0.0888 ± 0.0041 ms
  Backward: 0.0721 ± 0.0046 ms

Benchmarking Swish...
  Forward:  0.0624 ± 0.0217 ms
  Backward: 0.

In [4]:

def results_to_dataframe(results):
    """
    Convert benchmark results to a pandas DataFrame.
    
    Args:
        results: Nested dict with structure {device: {activation: {metric: value}}}
    
    Returns:
        pandas DataFrame with columns: Device, Activation, Forward_Mean, Forward_Std, Backward_Mean, Backward_Std
    """
    data = []
    
    for device, activations in results.items():
        for activation_name, timing in activations.items():
            if timing is not None:
                data.append({
                    'Device': device,
                    'Activation': activation_name,
                    'Forward_Mean (ms)': timing['forward_mean'],
                    'Forward_Std (ms)': timing['forward_std'],
                    'Backward_Mean (ms)': timing['backward_mean'],
                    'Backward_Std (ms)': timing['backward_std']
                })
            else:
                data.append({
                    'Device': device,
                    'Activation': activation_name,
                    'Forward_Mean (ms)': None,
                    'Forward_Std (ms)': None,
                    'Backward_Mean (ms)': None,
                    'Backward_Std (ms)': None
                })
    
    df = pd.DataFrame(data)
    return df



In [5]:
df = results_to_dataframe(results)
print(df)

df.to_csv("time_benchmark.csv", index=False)

   Device     Activation  Forward_Mean (ms)  Forward_Std (ms)  \
0     cpu           ReLU           0.054890          0.018215   
1     cpu           SiLU           0.050897          0.048995   
2     cpu           GELU           0.064667          0.041575   
3     cpu        Sigmoid           0.053166          0.033032   
4     cpu      LeakyReLU           0.070024          0.050513   
5     cpu          PReLU           0.040607          0.036993   
6     cpu            ELU           0.047236          0.034580   
7     cpu     Hardshrink           0.056221          0.038612   
8     cpu     Softshrink           0.024031          0.014457   
9     cpu     Tanhshrink           0.087765          0.037627   
10    cpu       Hardtanh           0.029659          0.044713   
11    cpu       Softplus           0.111935          0.033768   
12    cpu       Softsign           0.069600          0.044728   
13    cpu           Tanh           0.064608          0.019848   
14    cpu           CELU 

In [9]:
df1 = pd.read_csv("time_benchmark.csv")
df2 = pd.read_csv("./Output/time_benchmark.csv")

In [12]:
df_combined = pd.concat([df1, df2], ignore_index=True)
df_combined.to_csv("./Output/combined_time_benchmark.csv", index=False)

## GPU Details

In [22]:
print(torch.cuda.is_available())

True


In [27]:
import torch

# Check if CUDA is available
print(f"CUDA available: {torch.cuda.is_available()}")


# Get number of GPUs
print(f"Number of GPUs: {torch.cuda.device_count()}")

# Get current GPU name
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    
    # Get memory info (in bytes)
    print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"Allocated memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
    print(f"Cached memory: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

CUDA available: True
Number of GPUs: 1
GPU Name: NVIDIA A100-SXM4-40GB
Total memory: 42.47 GB
Allocated memory: 0.00 GB
Cached memory: 0.05 GB


In [24]:
import subprocess

result = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total,memory.free,memory.used', 
                        '--format=csv,noheader,nounits'], 
                       capture_output=True, text=True)
print(result.stdout)

NVIDIA A100-SXM4-40GB, 40960, 39962, 543



In [25]:
import pynvml

pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()

for i in range(device_count):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    
    print(f"GPU {i}: {pynvml.nvmlDeviceGetName(handle)}")
    print(f"  Total memory: {info.total / 1e9:.2f} GB")
    print(f"  Free memory: {info.free / 1e9:.2f} GB")
    print(f"  Used memory: {info.used / 1e9:.2f} GB")

pynvml.nvmlShutdown()

GPU 0: NVIDIA A100-SXM4-40GB
  Total memory: 42.95 GB
  Free memory: 41.90 GB
  Used memory: 1.05 GB


In [12]:
import sys 
import os 
print(sys.path) 
print(os.getcwd())

['/content', '/env/python', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '', '/usr/local/lib/python3.12/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.12/dist-packages/IPython/extensions', '/root/.ipython', '/tmp/tmpcdghsf86', '/usr/local/lib/python3.12/dist-packages/setuptools/_vendor']
/content
