In [2]:
import math

import cufflinks as cf
import numpy as np
import pandas as pd
from tensorflow import keras 
from IPYNBrenderer import render_google_doc

# Put cufflinks in offline mode so the `.iplot()` calls below render locally in the notebook.
cf.go_offline()

In [3]:
import warnings

# Blanket filter: hides every warning category for the rest of the session.
# NOTE(review): this also hides numpy RuntimeWarnings (e.g. overflow in exp) raised by the cells below.
warnings.filterwarnings('ignore')

In [8]:
# Shared Google Drive document to display inline.
URL = 'https://drive.google.com/file/d/1kRcEyQ6o8HTrplt7Kh29Ho6zaiJCbzmK/view?usp=sharing'
# presumably embeds the document at the given size; the recorded output is the string 'success' — TODO confirm
render_google_doc(URL , height="800" , width="50%" )

'success'

# Sigmoid Implementation
![image info](./Assets/sigmod.png)

$$ h_\theta(x) = \frac{1}{1 + e^{-\theta^T x}} $$ 

In [4]:
# sigmoid function deep learning
def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), evaluated element-wise.

    Written in an overflow-free form: the naive expression evaluates
    ``exp(-x)``, which overflows to ``inf`` for large negative ``x``.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Values in [0, 1] with the same shape as ``x`` (a numpy scalar for
        scalar input).
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) always lies in (0, 1], so it can underflow to 0 but never overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function, rearranged.
    out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Unwrap 0-d results to a numpy scalar; n-d arrays pass through unchanged.
    return out[()]

In [5]:
# 100 evenly spaced sample points covering [-10, 10].
X = np.linspace(-10,10,100)
# Sigmoid evaluated at every sample point.
y = sigmoid(X)

In [11]:
# Line plot of the sigmoid curve over the sampled grid.
pd.DataFrame({'x': X, 'y': y}).iplot(
    kind='scatter',
    x='x',
    y='y',
    mode='lines',
    xTitle='x',
    yTitle='y',
    title='sigmoid function',
)

# Gradient of the sigmoid function
$$ \frac{d\sigma}{dx}=\sigma(1-\sigma) $$




In [17]:

def sigmoid_derivative(x):
    """Gradient of the sigmoid: sigma(x) * (1 - sigma(x)), element-wise."""
    activation = sigmoid(x)
    return activation * (1 - activation)

In [18]:
# Sigmoid gradient evaluated on the grid X.
derviative_sigmoid =  sigmoid_derivative(X)

In [16]:
# Overlay the sigmoid curve ('y') and its gradient ('y1') on one figure.
pd.DataFrame({'x': X, 'y': y, 'y1': derviative_sigmoid}).iplot(
    kind='scatter',
    x='x',
    y=['y', 'y1'],
    mode='lines',
    xTitle='x',
    yTitle='y',
    title='sigmoid function and gradient of the sigmoid function',
)

# Advantages of the sigmoid function
 1. It is non-linear
 2. It is differentiable 
 3. It has a range of 0 to 1, which prevents jumps in the output
    - normalises the output
    - helps to reduce the solution space

# Disadvantages of the sigmoid function 
1. Prone to vanishing gradients     
2. The exponential term makes it computationally expensive (adds latency)  
$$ \frac{d\sigma}{dx} \in (0,\ 0.25] $$

# Tanh or hyperbolic tangent activation function deep learning
$$ \tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}} = \frac{1 - e^{-2x}}{1 + e^{-2x}} $$

In [19]:
# 100 evenly spaced sample points covering [-10, 10] (same grid as the sigmoid section).
X = np.linspace(-10,10,100)

In [29]:
def tanh(x):
    """Hyperbolic tangent, (e^x - e^-x) / (e^x + e^-x), evaluated element-wise.

    Delegates to ``np.tanh``: evaluating the textbook formula directly
    overflows for large |x| and yields inf / inf = NaN instead of +/-1.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Values in [-1, 1] with the same shape as ``x``.
    """
    return np.tanh(x)  # np.tanh(x) = (e^x - e^-x)/(e^x + e^-x)

In [30]:
# tanh evaluated at every sample point (rebinds `y` from the sigmoid section).
y = tanh(X)

$$ \frac{d\tanh(Z)}{dZ} = 1 - \tanh^2(Z) \in (0,\ 1] $$

In [37]:
def tanh_derivative(x):
    """Gradient of tanh: 1 - tanh(x)^2, element-wise."""
    activation = tanh(x)
    return 1 - activation**2

In [38]:
# tanh gradient evaluated on the grid X.
derviative_tanh =  tanh_derivative(X)

In [42]:
# Overlay the tanh curve and its gradient on one figure.
pd.DataFrame({'x': X, 'y': y, 'gradient': derviative_tanh}).iplot(
    kind='scatter',
    x='x',
    y=['y', "gradient"],
    mode='lines',
    xTitle='x',
    yTitle='y',
    title='Tanh function and gradient of the tanh function',
)

# Advantages of the tanh function
1. It is zero-centered
2. Its gradient around the origin is larger than sigmoid's (maximum of 1 versus 0.25), so it saturates less severely
3. It is differentiable at the origin
4. Symmetric around the origin
5. Mitigates the vanishing gradient problem, provided the weights are properly initialized
6. Suitable for hidden layers
# Disadvantages of the tanh function
1. It has saturation regions: the gradient approaches 0 for large |x|
2. Latency in computation as it has exponential terms
3. Not suitable for the output layer

# ReLU or Rectified Linear Unit
$$ ReLU(Z) = \max(Z, 0), \qquad \text{range} = [0, \infty) $$

In [44]:
# 100 evenly spaced sample points covering [-10, 10].
X= np.linspace(-10,10,100)

In [45]:
def relu(x):
    """Rectified Linear Unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

In [46]:
# ReLU evaluated at every sample point.
y = relu(X)

In [48]:
def relu_derivative(x):
    """Gradient of ReLU: 1 where ``x`` is strictly positive, 0 elsewhere (including x == 0)."""
    is_active = x > 0
    return np.where(is_active, 1, 0)

In [49]:
# ReLU gradient evaluated on the grid X.
derviative_relu =  relu_derivative(X)

In [50]:
# Overlay the ReLU curve and its gradient on one figure.
pd.DataFrame({'x': X, 'y': y, 'gradient': derviative_relu}).iplot(
    kind='scatter',
    x='x',
    y=['y', "gradient"],
    mode='lines',
    xTitle='x',
    yTitle='y',
    title='Relu function and gradient of the relu function',
)

# Advantages of the ReLU function
1. Calculates the gradient faster than the sigmoid and tanh functions, as there is no exponential term 
2. For all positive inputs there is no gradient saturation 
# Disadvantages 
1. For all negative inputs ReLU is completely inactive (zero output and zero gradient) 
2. At zero its derivative is not defined -> the gradient jumps from 0 to 1 

# leaky relu function

$$ LeakyReLU(Z) = \max(Z,\ \alpha Z), \qquad 0 < \alpha < 1 $$

In [63]:
# 100 evenly spaced sample points covering [-10, 10] (lowercase `x` in this section).
x = np.linspace(-10,10,100)

In [79]:
def leaky_relu(x , alpha=0.01):
    """Leaky ReLU: ``x`` where x > 0, otherwise ``alpha * x``, element-wise."""
    is_positive = x > 0
    leaked = alpha * x
    return np.where(is_positive, x, leaked)

In [80]:
# Leaky ReLU with the default alpha=0.01 evaluated on the grid.
y = leaky_relu(x)

In [81]:
def leaky_relu_derivative(x , alpha=0.01):
    """Gradient of Leaky ReLU: 1 where x > 0, otherwise ``alpha`` (including x == 0)."""
    is_positive = x > 0
    return np.where(is_positive, 1, alpha)

In [82]:
# Leaky ReLU gradient (default alpha=0.01) evaluated on the grid.
derivative_leaky_relu =  leaky_relu_derivative(x)

In [88]:
# Second curve with a much larger negative slope (alpha=0.5) for visual comparison.
y1 = leaky_relu(x,alpha= 0.5)

In [89]:
# Overlay Leaky ReLU at the default alpha ('y'), at alpha=0.5, and the default-alpha gradient.
# The second trace is labelled "alpha=0.5" because y1 was computed with alpha=0.5, not 1.
pd.DataFrame({'x': x, 'y': y, "alpha=0.5": y1, 'gradient': derivative_leaky_relu}).iplot(
    kind='scatter',
    x='x',
    y=['y', "alpha=0.5", "gradient"],
    mode='lines',
    xTitle='x',
    yTitle='y',
    title='Leaky Relu function and gradient of the Leaky relu function',
)

# advantage of leaky relu activation function
1. It does not suffer from the dying relu problem
2. no gradient vanishing problem 
3. It is computationally efficient 
# disadvantage of leaky relu activation function
1. It is not zero centered
2. at x=0 the gradient is not defined


# PReLU activation function
Parametric ReLU (PReLU) is a modified version of the ReLU function where the slope of the negative part is a parameter that is learned during training.

$$ f(Z) = \begin{cases} Z & \text{if } Z \ge 0 \\ \alpha Z & \text{otherwise} \end{cases} $$

α = learnable (trainable) parameter

PReLU(Z) = Z if Z ≥ 0 else α*Z

The main convention is that α is a learnable parameter.

If α = 0, PReLU reduces to ReLU.

If α is a small fixed constant (e.g. 0.01), PReLU reduces to Leaky ReLU.

# Advantages of PReLU 
1. It does not suffer from the dying ReLU problem
2. α is a trainable parameter
3. α adapts to the data 

# Disadvantages of PReLU
1. Extra parameter to train - computationally more expensive

In [3]:
# Instantiate Keras' built-in PReLU layer with alpha initialised to zeros and no regulariser,
# constraint or shared axes; the cell output is just the layer object's repr.
keras.layers.PReLU(alpha_initializer='zeros', alpha_regularizer=None, alpha_constraint=None, shared_axes=None)

<keras.layers.activation.prelu.PReLU at 0x7f9605e533d0>

In [92]:
# Second shared Google Drive document to display inline (rebinds URL).
URL = "https://drive.google.com/file/d/1V7aF2G0jYk0jTp8E2FCYCLfS5yTFdTsX/view?usp=sharing"
# presumably embeds the document at the given size; the recorded output is the string 'success' — TODO confirm
render_google_doc(URL , height="800" , width="75%" )

'success'

In [4]:
# Softmax activation function

$$ softmax(z_i) = \frac{e^{z_{i}}}{\sum_{j=1}^K e^{z_{j}}} \quad \text{for } i=1,2,\dots,K $$

In [5]:
def softmax(x, axis=None):
    """Softmax: exp(x) / sum(exp(x)), normalised into a probability distribution.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps every exponent <= 0, so large inputs
    no longer overflow to inf and produce NaN.

    Args:
        x: Scalar or array-like of real numbers.
        axis: Axis (or axes) along which to normalise. The default ``None``
            normalises over all elements.

    Returns:
        Array with the same shape as ``x`` whose entries sum to 1 along
        ``axis``. An empty input is returned unchanged.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        # np.max raises on empty input; there is nothing to normalise anyway.
        return x
    # keepdims=True lets the reductions broadcast back against x for any axis.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [6]:
# 100 evenly spaced sample points covering [-10, 10].
X = np.linspace(-10,10,100)

# Softmax is usually used in the output layer of a neural network for multi-class classification
- It is a generalization of the logistic function to multiple classes.
- It takes as input a vector of K real numbers and normalizes it into a probability distribution consisting of K probabilities.
- The output of the softmax function is a vector of probabilities that sum to 1.
- For binary classification (K = 2), the softmax function is equivalent to the logistic (sigmoid) function.


Signature: keras.activations.softmax(x, axis=-1)


Docstring:
Softmax converts a vector of values to a probability distribution.

The elements of the output vector are in range (0, 1) and sum to 1.

Each vector is handled independently. The `axis` argument sets which axis
of the input the function is applied along.

Softmax is often used as the activation for the last
layer of a classification network because the result could be interpreted as
a probability distribution.

The softmax of each vector x is computed as
`exp(x) / tf.reduce_sum(exp(x))`.

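# ELU or Exponential Linear Unit activation function    
$$ ELU(Z) = \begin{cases} Z & \text{if } Z > 0 \\ \alpha (e^{Z} - 1) & \text{otherwise} \end{cases} $$

α = scalar value that scales the negative section

Derivative 
$$ ELU'(Z, \alpha) = \begin{cases} 1 & \text{if } Z > 0 \\ \alpha e^{Z} & \text{otherwise} \end{cases} $$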

In [11]:
# 100 evenly spaced sample points covering [-10, 10].
X = np.linspace(-10,10,100)

In [12]:
def elu(x,alpha=1):
    """Exponential Linear Unit: ``x`` where x > 0, otherwise ``alpha * (exp(x) - 1)``.

    Args:
        x: Scalar or array-like of real numbers.
        alpha: Scale applied to the negative branch.

    Returns:
        Array with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # np.where evaluates both branches for every element, so the exponent is
    # clamped to <= 0 to stop large positive inputs from overflowing.
    # expm1 keeps full precision for tiny |x|, where exp(x) - 1 cancels.
    negative_branch = alpha * np.expm1(np.minimum(x, 0.0))
    return np.where(x > 0, x, negative_branch)

In [13]:
# ELU with the default alpha=1 evaluated at every sample point.
y = elu(X)

In [14]:
def elu_derivative(x,alpha=1):
    """Gradient of ELU: 1 where x > 0, otherwise ``alpha * exp(x)``.

    Args:
        x: Scalar or array-like of real numbers.
        alpha: Scale applied to the negative branch.

    Returns:
        Array with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # np.where evaluates both branches for every element; clamping the exponent
    # to <= 0 avoids overflow on large positive inputs whose exp() is discarded.
    negative_slope = alpha * np.exp(np.minimum(x, 0.0))
    return np.where(x > 0, 1, negative_slope)

In [15]:
# ELU gradient (default alpha=1) evaluated on the grid X.
derivative_elu =  elu_derivative(X)

In [16]:
# Overlay the ELU curve and its gradient on one figure.
pd.DataFrame({'x': X, 'y': y, 'gradient': derivative_elu}).iplot(
    kind='scatter',
    x='x',
    y=['y', "gradient"],
    mode='lines',
    xTitle='x',
    yTitle='y',
    title='Elu function and gradient of the elu function',
)

# advantage of elu
1. no dead relu issue 
2. approx zero centric 

# disadvantage of elu
1. computationally expensive

# scaled exponential linear unit (SELU)

$$ SELU(z) = \text{scale} \cdot ELU(z, \alpha) $$


$$ SELU(z) = K \cdot ELU(z, \alpha), \qquad K \approx 1.0507,\ \alpha \approx 1.6733 \ \text{(fixed constants)} $$


# handson-ml2

![image info](./Assets/weight%20initialization.png)
![image info](./Assets/activationfunction.png)