📝 **Author:** Amirhossein Heydari - 📧 **Email:** amirhosseinheydari78@gmail.com - 📍 **Linktree:** [linktr.ee/mr_pylin](https://linktr.ee/mr_pylin)

---

# Dependencies

In [6]:
import matplotlib.pyplot as plt
import torch

## `torch`
   - Tensor-level functions such as `torch.relu`; not commonly used directly in user code.

In [7]:
from torch import relu, sigmoid, sign, softmax, tanh

## `torch.nn`
   - Creates a module.
   - Can be used as a layer in a neural network.
   - Suitable for building complex models.

In [8]:
from torch.nn import ELU, GELU, LeakyReLU, LogSigmoid, LogSoftmax, Mish, ReLU, Sigmoid, SiLU, Softmax, Softplus, Tanh

## `torch.nn.functional`
   - Functional API for applying activation functions.
   - More flexible than `torch.nn` for custom operations.
   - Often used directly in model forward passes.
   - Provides more control over the computation graph.

In [9]:
from torch.nn.functional import elu, gelu, leaky_relu, logsigmoid, log_softmax, mish, relu, sigmoid, silu, softmax, softplus, tanh

# Activation Functions
   - Activation functions are used to introduce non-linearity into the neural network.
   - Without an activation function, a neural network would behave like a linear regression model, no matter how many layers it has!

<figure style="text-align: center;">
    <img src="../../assets/images/original/mlp/no-activation-network.svg" alt="no-activation-network.svg" style="width: 100%;">
    <figcaption style="text-align: center;">Neural Network without Any Activation Functions is just a Linear Transformation of Input to the Output</figcaption>
</figure>

📝 Docs:
   - Non-linear Activations (weighted sum, nonlinearity): [pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity](https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity)
   - Non-linear Activations (other): [pytorch.org/docs/stable/nn.html#non-linear-activations-other](https://pytorch.org/docs/stable/nn.html#non-linear-activations-other)
   - Non-linear activation functions: [pytorch.org/docs/stable/nn.functional.html#non-linear-activation-functions](https://pytorch.org/docs/stable/nn.functional.html#non-linear-activation-functions)

✍️ **Notes**:
   - The plain Python functions below are for illustration only; they are not the proper way to implement an activation function in PyTorch.
   - The proper implementation is covered in later notebooks.

In [None]:
# Shared input grid: 1001 evenly spaced samples spanning [-10, +10]
x = torch.linspace(start=-10, end=10, steps=1001)

# Print the (abbreviated) tensor so the sampled domain can be inspected
print(x)

tensor([-10.0000,  -9.9800,  -9.9600,  ...,   9.9600,   9.9800,  10.0000])


## Linear

In [None]:
def linear_func(x: torch.Tensor) -> torch.Tensor:
    """Identity activation: hand the input tensor back unchanged (no copy is made)."""
    output = x
    return output

In [None]:
# Draw the identity activation on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, linear_func(x))
ax.set_title("Linear")
ax.grid(True)
plt.show()

## Sigmoid
   - Historically used for `binary classification`, but less common now due to [vanishing gradient](https://towardsdatascience.com/the-vanishing-gradient-problem-69bf08b15484) issues.

In [None]:
def sigmoid_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise logistic sigmoid, ``1 / (1 + exp(-x))``.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    decay = torch.exp(torch.neg(x))
    denominator = 1 + decay
    return 1 / denominator

In [None]:
# Draw the sigmoid curve on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, sigmoid_func(x))
ax.set_title("Sigmoid")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-4, 4)
plt.show()

## Hyperbolic Tangent (Tanh)
   - Similar to `sigmoid` but centered around 0, used in [recurrent neural networks (RNNs)](https://karpathy.github.io/2015/05/21/rnn-effectiveness/) and older architectures.

In [None]:
def tanh_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise hyperbolic tangent, computed without overflow.

    The textbook form ``(e^x - e^-x) / (e^x + e^-x)`` evaluates to ``inf / inf``
    (NaN) once ``exp`` overflows. This version only exponentiates non-positive
    numbers, using ``tanh(|x|) = -expm1(-2|x|) / (2 + expm1(-2|x|))`` and then
    restoring the sign.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape as ``x`` and values in ``[-1, 1]``.
    """
    negative = x < 0
    # |x| written with `where` rather than `abs` so autograd keeps the correct slope at x == 0
    magnitude = torch.where(negative, -x, x)
    # expm1(-2|x|) lies in (-1, 0], so it cannot overflow and stays accurate near zero
    shrink = torch.expm1(-2 * magnitude)
    ratio = -shrink / (2 + shrink)
    return torch.where(negative, -ratio, ratio)

In [None]:
# Draw the tanh curve on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, tanh_func(x))
ax.set_title("Hyperbolic Tangent (Tanh)")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-4, 4)
plt.show()

## Softplus
   - Smooth approximation of `ReLU`.

In [None]:
def softplus_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise softplus, ``log(1 + exp(x))``, computed without overflow.

    Evaluating ``exp(x)`` directly overflows for large ``x`` and yields ``inf``.
    The identity ``softplus(x) = max(x, 0) + log1p(exp(-|x|))`` only
    exponentiates non-positive numbers, so the result stays finite.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    negative = x < 0
    # |x| and max(x, 0) written with `where` so autograd keeps the correct slope at x == 0
    magnitude = torch.where(negative, -x, x)
    positive_part = torch.where(negative, torch.zeros_like(x), x)
    # exp(-|x|) is in (0, 1]; log1p keeps precision when that term is tiny
    return positive_part + torch.log1p(torch.exp(-magnitude))

In [None]:
# Draw the softplus curve on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, softplus_func(x))
ax.set_title("Softplus")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
plt.show()

## LogSigmoid
   - Logarithm of `sigmoid`, less common but used in specific applications.

In [None]:
def logsigmoid_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise ``log(sigmoid(x))``, computed without overflow or ``log(0)``.

    Taking ``log`` of an explicitly computed sigmoid returns ``-inf`` once the
    sigmoid underflows to zero for very negative ``x``. The identity
    ``logsigmoid(x) = min(x, 0) - log1p(exp(-|x|))`` avoids that.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape as ``x`` and non-positive values.
    """
    negative = x < 0
    # |x| and min(x, 0) written with `where` so autograd keeps the correct slope at x == 0
    magnitude = torch.where(negative, -x, x)
    negative_part = torch.where(negative, x, torch.zeros_like(x))
    # exp(-|x|) is in (0, 1]; log1p keeps precision when that term is tiny
    return negative_part - torch.log1p(torch.exp(-magnitude))

In [None]:
# Draw the log-sigmoid curve on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, logsigmoid_func(x))
ax.set_title("LogSigmoid")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
plt.show()

## Rectified Linear Unit (ReLU)
   - Most commonly used, computationally efficient, but suffers from the [dying ReLU](https://datascience.stackexchange.com/questions/5706/what-is-the-dying-relu-problem-in-neural-networks) ([vanishing gradient](https://towardsdatascience.com/the-vanishing-gradient-problem-69bf08b15484)) problem.

In [None]:
def relu_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise rectified linear unit, ``max(x, 0)``.

    Clamping against the scalar ``0`` keeps the dtype of ``x`` (comparing with a
    float zero tensor promoted integer inputs to float) and avoids building a
    throwaway tensor on every call.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape and dtype as ``x``; negative entries become 0.
    """
    return torch.clamp(x, min=0)

In [None]:
# Draw the ReLU curve on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, relu_func(x))
ax.set_title("Rectified Linear Unit (ReLU)")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
plt.show()

## LeakyReLU
   - Addresses the `dying ReLU` problem by allowing a small, non-zero gradient for negative inputs.

In [None]:
def leaky_relu_func(x: torch.Tensor, negative_slope: float = 0.2) -> torch.Tensor:
    """Element-wise leaky ReLU: ``x`` for ``x >= 0``, ``negative_slope * x`` otherwise.

    The branch is chosen from the sign of ``x`` instead of ``max(x, slope * x)``,
    which only matches the definition when ``0 <= negative_slope <= 1``.

    Args:
        x: Input tensor.
        negative_slope: Multiplier applied to negative inputs.

    Returns:
        Tensor with the same shape as ``x``.
    """
    return torch.where(x >= 0, x, negative_slope * x)

In [None]:
# Draw the LeakyReLU curve (default slope) on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, leaky_relu_func(x))
ax.set_title("LeakyReLU")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
plt.show()

## Exponential Linear Unit (ELU)
   - Similar to `LeakyReLU` but uses an exponential function for negative inputs, often providing better performance than `ReLU`.

In [None]:
def elu_func(x: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    """Element-wise exponential linear unit.

    Returns ``x`` where ``x > 0`` and ``alpha * (exp(x) - 1)`` elsewhere.

    Args:
        x: Input tensor.
        alpha: Scale of the negative saturation value.

    Returns:
        Tensor with the same shape as ``x``.
    """
    # `where` evaluates both branches, so the exponential must never see large
    # positive inputs: an overflowed `inf` there turns into NaN gradients.
    non_positive = torch.clamp(x, max=0)
    # expm1 is more accurate than exp(x) - 1 for inputs close to zero
    negative_branch = alpha * torch.expm1(non_positive)
    return torch.where(x > 0, x, negative_branch)

In [None]:
# Draw the ELU curve (default alpha) on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, elu_func(x))
ax.set_title("Exponential Linear Unit (ELU)")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
plt.show()

## Sigmoid Linear Unit (SiLU)
   - Combines ReLU-like behavior with a smooth curve, often yielding better results than `ReLU`.

In [None]:
def silu_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise SiLU: the input multiplied by its own sigmoid.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    gate = torch.sigmoid(x)
    return torch.mul(x, gate)

In [None]:
# Draw the SiLU curve on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, silu_func(x))
ax.set_title("Sigmoid Linear Unit (SiLU)")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
plt.show()

## Mish
   - Self-regularized activation function, generally performs better than `ReLU` and its variants.

In [None]:
def mish_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise Mish: ``x * tanh(softplus(x))``.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    smoothed = torch.nn.functional.softplus(x)
    gate = torch.tanh(smoothed)
    return x * gate

In [None]:
# Draw the Mish curve on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, mish_func(x))
ax.set_title("Mish")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
plt.show()

## Softmax
   - Used for `multi-class classification`; outputs a probability for each of the [mutually exclusive](https://en.wikipedia.org/wiki/Softmax_function) classes, and is often applied `internally` by `CrossEntropyLoss`.

In [None]:
def softmax_func(x: torch.Tensor, dim=None) -> torch.Tensor:
    """Softmax along one dimension of ``x``.

    Args:
        x: Input tensor.
        dim: Dimension to normalise over; the last dimension when ``None``.

    Returns:
        Tensor with the same shape as ``x`` whose entries sum to 1 along ``dim``.
    """
    axis = x.dim() - 1 if dim is None else dim
    # Subtract the per-slice maximum first so the largest exponent is exp(0)
    peak, _ = x.max(dim=axis, keepdim=True)
    numerator = torch.exp(x - peak)
    normaliser = numerator.sum(dim=axis, keepdim=True)
    return numerator / normaliser

In [None]:
# Draw softmax taken over the whole input grid on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, softmax_func(x))
ax.set_title("Softmax")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-0.05, 0.05)
plt.show()

## LogSoftmax
   - Logarithm of softmax, often used in `NLLLoss`.
   - Reduces the risk of numerical issues and gives more reliable calculations than applying a logarithm to `Softmax`.

In [None]:
def logsoftmax_func(x: torch.Tensor, dim=None) -> torch.Tensor:
    """Log-softmax along one dimension of ``x``.

    Computed as ``x - logsumexp(x)`` rather than ``log(softmax(x))``: the latter
    returns ``-inf`` for entries whose softmax probability underflows to zero.

    Args:
        x: Input tensor.
        dim: Dimension to normalise over; the last dimension when ``None``.

    Returns:
        Tensor with the same shape as ``x`` holding log-probabilities.
    """
    if dim is None:
        dim = x.dim() - 1
    return x - torch.logsumexp(x, dim=dim, keepdim=True)

In [None]:
# Draw log-softmax taken over the whole input grid on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, logsoftmax_func(x))
ax.set_title("LogSoftmax")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-25, 0)
plt.show()

## Gaussian Error Linear Units (GeLU)
   - Approximates the expected value of `ReLU` with a Gaussian input, often used in `transformer-based` models.

In [None]:
def gelu_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise GELU in its exact (erf-based) form.

    Computes ``x * 0.5 * (1 + erf(x / sqrt(2)))``.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    scaled = x / 2.0 ** 0.5
    erf_term = 1.0 + torch.erf(scaled)
    return x * 0.5 * erf_term

In [None]:
# Draw the GELU curve on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, gelu_func(x))
ax.set_title("Gaussian Error Linear Units (GeLU)")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
plt.show()

## Plot Activation Functions

In [None]:
# Overview grid of every activation implemented in this notebook.
# Each entry is (panel title, function to plot, y-axis limits), listed in row-major order.
# The LeakyReLU panel uses this notebook's `leaky_relu_func` (like every other panel)
# rather than the library `leaky_relu`, whose default slope differs.
panels = [
    ("Rectified Linear Unit (ReLU)", relu_func, (-10, 10)),
    ("LeakyReLU", leaky_relu_func, (-10, 10)),
    ("Exponential Linear Unit (ELU)", elu_func, (-10, 10)),
    ("Sigmoid Linear Unit (SiLU)", silu_func, (-10, 10)),
    ("Mish", mish_func, (-10, 10)),
    ("Sigmoid", sigmoid_func, (-4, 4)),
    ("Hyperbolic Tangent (Tanh)", tanh_func, (-4, 4)),
    ("Softplus", softplus_func, (-10, 10)),
    ("LogSigmoid", logsigmoid_func, (-10, 10)),
    ("Softmax", softmax_func, (-0.05, 0.05)),
    ("LogSoftmax", logsoftmax_func, (-25, 0)),
    ("Gaussian Error Linear Units (GeLU)", gelu_func, (-10, 10)),
]

fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(12, 8), layout='compressed')
fig.suptitle("Activation Functions")

# axs is a 3x4 grid; `.flat` walks it row by row, matching the order of `panels`
for ax, (title, activation, ylim) in zip(axs.flat, panels):
    ax.plot(x, activation(x))
    ax.set(title=title, xlim=(-10, 10), ylim=ylim)
    ax.grid(True)

plt.show()

# Threshold Functions
   - Threshold functions are a simpler type of activation function primarily used in the early development of neural networks
   - These functions decide whether a neuron should be activated or not based on whether the input surpasses a certain threshold

## Step

In [None]:
def step_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise unit step: 1 where ``x >= 0``, 0 elsewhere.

    Args:
        x: Input tensor.

    Returns:
        Tensor of ones and zeros with the same shape and dtype as ``x``.
    """
    is_active = torch.ge(x, 0)
    # Cast the boolean mask back to the input dtype to obtain the 1/0 values
    return is_active.to(x.dtype)

In [None]:
# Draw the step function on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, step_func(x))
ax.set_title("Step")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-2, 2)
plt.show()

## Sign

In [None]:
def sign_func(x: torch.Tensor) -> torch.Tensor:
    """Element-wise sign: 1 for positive, -1 for negative, 0 otherwise.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    ones = torch.ones_like(x)
    zeros = torch.zeros_like(x)
    # Resolve the non-positive entries first, then overlay the positive ones
    non_positive = torch.where(x < 0, ones * -1, zeros)
    return torch.where(x > 0, ones, non_positive)

In [None]:
# Draw the sign function on its own square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(x, sign_func(x))
ax.set_title("Sign")
ax.grid(True)
ax.set_xlim(-10, 10)
ax.set_ylim(-2, 2)
plt.show()

## Plot Threshold Functions

In [None]:
# Side-by-side comparison of the two threshold functions, one panel each
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), layout='compressed')
fig.suptitle("Threshold Functions")
for ax, (title, threshold) in zip(axs, (("step", step_func), ("sign", sign_func))):
    ax.plot(x, threshold(x))
    ax.grid(True)
    ax.set(title=title, xlim=[-10, 10], ylim=[-2, 2])
plt.show()