# **Activation Functions**

## **The linear activation function**

In [None]:
## the input is the output
def linear(z, m=1):
  """Linear activation: scale the pre-activation ``z`` by the slope ``m``.

  Args:
    z: Pre-activation value (anything that supports multiplication by ``m``).
    m: Slope (multiplier). Defaults to 1, which makes the output equal the input.

  Returns:
    ``m*z``.
  """
  return m*z

In [None]:
## m is slope or multiplier
## m = 1
## inputs, weights and bias of a single neuron
x1, x2 = 2, 5
w1, w2 = 0.5, -1.5
b = 10

## the sum function: bias plus the weighted inputs
z = b + w1*x1 + w2*x2

## activation (slope 1, so the output equals z)
linear(z, 1)

3.5

In [None]:
## the derivative has to be efficiently computed
## if m = 1
## linear_prime = 1
## if m = c
## linear_prime  = c
def linear_prime(z, m=1):
  """Derivative of the linear activation ``m*z`` with respect to ``z``.

  The derivative is the constant slope ``m``; it does not depend on ``z``.

  Args:
    z: Pre-activation value (unused, because the derivative is constant).
    m: Slope (multiplier) of the linear activation. Defaults to 1.

  Returns:
    ``m``.
  """
  return m

In [None]:
## derivative of the linear activation with slope m = 1
linear_prime(z, 1)

1

### **Tensorflow Implementation**

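With TensorFlow we can compute the activation of a whole vector at once. TensorFlow also computes the derivative on its own (automatic differentiation), so no hand-written derivative function is needed.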

In [None]:
import tensorflow as tf

In [None]:
## a vector of pre-activations as a float32 tensor
z = tf.constant([2, 1.5, -3, 10, 6], dtype = tf.float32)
## Keras' linear activation; the output below shows the values come back unchanged
a = tf.keras.activations.linear(z)
## convert the tensor to a NumPy array for display
a.numpy()

array([ 2. ,  1.5, -3. , 10. ,  6. ], dtype=float32)

## **ReLU**

ReLU (rectified linear unit) is a relatively recent activation function. It is very cheap to compute, and it cuts off the signal of a neuron (outputs 0) whenever the input is negative.

In [None]:
import numpy as np
def relu(z):
  """ReLU activation: ``max(0, z)``, applied element-wise.

  Args:
    z: Pre-activation; a scalar or an array-like of numbers.

  Returns:
    ``z`` where it is positive and 0 elsewhere: a NumPy scalar for scalar
    input, an ndarray for array input.
  """
  ## np.maximum compares element by element, so vectors work too;
  ## np.max((0, z)) only worked when z was a single number
  return np.maximum(0, z)

In [None]:
## recompute the neuron's weighted sum (z was reassigned to a tensor in the TensorFlow cell)
z = b + w1*x1 + w2*x2
print(z)

3.5


In [None]:
## z = 3.5 is positive, so relu returns it unchanged
relu(z)

3.5

In [None]:
## negative z: relu should cut the signal off (return 0)
z = -10

In [None]:
## negative input -> 0
relu(z)

0

In [None]:
## derivative
def relu_prime(z):
  """Derivative of ReLU: 1 where ``z > 0``, otherwise 0.

  The derivative at ``z == 0`` is taken to be 0.

  Args:
    z: Pre-activation; a scalar or an array-like of numbers.

  Returns:
    The int 1 or 0 for a scalar ``z``; an integer ndarray of 1s and 0s with
    the shape of ``z`` for array input.
  """
  if np.ndim(z) == 0:
    return 1 if z > 0 else 0
  ## element-wise for vectors; the scalar conditional above raises on arrays
  return (np.asarray(z) > 0).astype(int)

In [None]:
## z is still -10 here, so the derivative is 0
print(f'z is {z}')
print(f'the derivative of relu is {relu_prime(z)}')

z is -10
the derivative of relu is 0


In [None]:
## positive z: the derivative is 1
z = 3.5
print(f'z is {z}')
print(f'the derivative of relu is {relu_prime(z)}')

z is 3.5
the derivative of relu is 1


### **Tensorflow implementation**

In [None]:
## ReLU on a vector: the negative entry (-3) becomes 0, the rest pass through
z = tf.constant([2, 1.5, -3, 10, 6], dtype = tf.float32)
tf.keras.activations.relu(z)

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([ 2. ,  1.5,  0. , 10. ,  6. ], dtype=float32)>

## **Sigmoid - Logistic Activation**

Outputs from 0 to 1.

In [None]:
def sigmoid(z):
  """Logistic (sigmoid) activation: ``1 / (1 + e^(-z))``.

  Args:
    z: Pre-activation; a scalar or a NumPy array.

  Returns:
    The activation, between 0 and 1, with the same shape as ``z``.
  """
  exp_neg_z = np.exp(-z)
  return 1/(1 + exp_neg_z)

In [None]:
## when z is a large positive value, sigmoid(z) is close to 1;
## when z is negative (as here), sigmoid(z) is below 0.5
z = -2
print(f'The value of z is {z}')
print(f'The sigmoid activation is {sigmoid(z)}')

The value of z is -2
The sigmoid activation is 0.11920292202211755


In [None]:
## the derivative is
## a*(1-a)
def sigmoid_prime(z):
  """Derivative of the sigmoid: ``a*(1-a)`` where ``a = sigmoid(z)``.

  Args:
    z: Pre-activation; passed straight to ``sigmoid``.

  Returns:
    The derivative of the sigmoid evaluated at ``z``.
  """
  a = sigmoid(z)
  return a*(1-a)

In [None]:
## for a very negative z the sigmoid is close to 0 and so is its derivative
z = -10
print(f'The value of z is {z}')
print(f'The sigmoid activation is {sigmoid(z)}')
print(f'The derivative is {sigmoid_prime(z)}')

The value of z is -10
The sigmoid activation is 4.5397868702434395e-05
The derivative is 4.5395807735951673e-05


In [None]:
## gradient descent update: w_new = w_old - step*gradient
## if the gradient is (close to) zero, then the new weights = the old weights
## and learning stalls: this is the vanishing gradient problem
## feed-forward NNs do not work well with images

## **Tanh Activation**

One of the top-performing activations; however, it is computationally expensive because it requires evaluating exponentials.

In [None]:
def tanh(z):
  """Hyperbolic tangent activation: ``(e^z - e^-z) / (e^z + e^-z)``.

  Args:
    z: Pre-activation; a scalar or an array-like of numbers.

  Returns:
    The activation, between -1 and 1, with the same shape as ``z``.
  """
  ## np.tanh evaluates the same function but stays finite for large |z|;
  ## the explicit exp ratio overflows to inf/inf = nan once |z| exceeds ~710
  return np.tanh(z)

In [None]:
def tanh_prime(z):
  """Derivative of tanh: ``1 - tanh(z)^2``.

  Args:
    z: Pre-activation; passed straight to ``tanh``.

  Returns:
    The derivative of tanh evaluated at ``z``.
  """
  a = tanh(z)
  return 1 - np.power(a, 2)

In [None]:
## Example: a very negative z, where tanh is close to -1 and its derivative close to 0
z = -10
print(f'The value of z is {z}')
print(f'The tanh activation is {tanh(z)}')
print(f'The derivative of tanh activation is {tanh_prime(z)}')

The value of z is -10
The tanh activation is -0.9999999958776926
The derivative of tanh activation is 8.244614768671e-09


In [None]:
## the vanishing gradient is an issue here

### **Tensorflow implementation**

In [None]:
## the same example vector used for the earlier TensorFlow activations
z = tf.constant([2, 1.5, -3, 10, 6], dtype = tf.float32)

In [None]:
## sigmoid of every entry of z
tf.keras.activations.sigmoid(z)

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([0.8807971 , 0.8175745 , 0.04742587, 0.9999546 , 0.99752736],
      dtype=float32)>

In [None]:
## tanh of every entry of z
tf.keras.activations.tanh(z)

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([ 0.9640276 ,  0.9051482 , -0.9950547 ,  1.        ,  0.99998784],
      dtype=float32)>

## **Softmax activation**

Used in the output layer of multi-class problems because the probabilities sum to one

In [None]:
## example pre-activations, one value per output unit
z = np.array([1.4, 4, 2.1, -3])

In [None]:
## softmax of the first score: exp(z[0]) divided by the sum of exp over all scores
## (computed from z instead of re-typing its values by hand)
np.exp(z[0]) / np.sum(np.exp(z))

0.06064366672741932

# **Backpropagation**

In [None]:
import pandas as pd

## load the housing data from a CSV hosted on GitHub and preview the first rows
df = pd.read_csv('https://raw.githubusercontent.com/martinwg/ISA630/master/data/housing_data.csv')
df.head()

Unnamed: 0,Bedrooms,Area,City_Distance,Age,Price
0,1,26.184098,1286.68,67,96004.804557
1,1,34.866901,1855.25,30,92473.72257
2,1,36.980709,692.09,24,98112.51994
3,1,17.445723,1399.49,66,92118.326874
4,1,52.587646,84.65,3,98976.653176


In [None]:
## feature matrix X: every column except the target 'Price'
## target vector y: the 'Price' column
X = df.drop(columns = 'Price').values
y = df['Price'].values

In [None]:
## no hidden layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense

## instance
nn = Sequential()
## shape is given as a tuple (the documented form); newer Keras versions reject a bare int
nn.add(Input(shape = (X.shape[1],)))  # input layer: one entry per feature column
nn.add(Dense(1, activation = "linear")) # output layer

In [None]:
## number of parameters
## 4 weights (one per input feature) * 1 output unit + 1 bias = 5
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 5         
                                                                 
Total params: 5 (20.00 Byte)
Trainable params: 5 (20.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
## the weights (parameters) are randomly initialized
## limit = sqrt(6 / (fan_in + fan_out))
## here fan_in = 4 inputs and fan_out = 1 output unit
## NOTE(review): this looks like the Glorot (Xavier) uniform limit - confirm it is the Dense default
import numpy as np
np.sqrt(6 / (4+1))

1.0954451150103321

In [None]:
## the interval [-limit, limit] for the initial weights, using the limit computed above
[-1.0954451150103321, 1.0954451150103321]

[-1.0954451150103321, 1.0954451150103321]

In [None]:
## the values of the weights before any training
## they are untrained - bad results
nn.weights

[<tf.Variable 'dense/kernel:0' shape=(4, 1) dtype=float32, numpy=
 array([[ 0.796713  ],
        [ 0.1376977 ],
        [ 0.42014325],
        [-0.07861173]], dtype=float32)>,
 <tf.Variable 'dense/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]

In [None]:
## predictions of the untrained network: one row per observation, a single column
nn.predict(X)



array([[539.72516],
       [782.7102 ],
       [294.77914],
       ...,
       [710.9223 ],
       [801.34766],
       [116.32672]], dtype=float32)

In [None]:
## network loss function calculates this: the squared error of every observation
## nn.predict(X) has shape (n, 1) while y has shape (n,); without flattening,
## the subtraction broadcasts to an (n, n) matrix instead of n residuals
(y - nn.predict(X).ravel())**2



array([[9.07250483e+09, 8.41230439e+09, 9.47846580e+09, ...,
        8.65804351e+09, 8.49339590e+09, 8.64675698e+09],
       [8.99686418e+09, 8.33947375e+09, 9.40114784e+09, ...,
        8.58415447e+09, 8.42021430e+09, 8.57291622e+09],
       [9.14101824e+09, 8.47828266e+09, 9.54849244e+09, ...,
        8.72497665e+09, 8.55969079e+09, 8.71364657e+09],
       ...,
       [9.07238503e+09, 8.41218903e+09, 9.47834335e+09, ...,
        8.65792648e+09, 8.49327999e+09, 8.64664003e+09],
       [8.98589982e+09, 8.32891768e+09, 9.38993977e+09, ...,
        8.57344462e+09, 8.40960725e+09, 8.56221339e+09],
       [9.19459966e+09, 8.52988807e+09, 9.60325336e+09, ...,
        8.77732633e+09, 8.61154299e+09, 8.76596230e+09]])

In [None]:
## calculate the cost function (sum or average)
## flatten the (n, 1) predictions so the residuals form a length-n vector;
## otherwise broadcasting sums an (n, n) matrix and inflates the cost
np.sum((y - nn.predict(X).ravel())**2) / X.shape[0]



39063808124768.67

In [None]:
## compile: Adam optimizer, mean squared error as the loss, R2 reported as a metric
import tensorflow as tf
nn.compile(optimizer = "adam", loss = "mean_squared_error", metrics = [tf.keras.metrics.R2Score()])

In [None]:
## (number of observations, number of features)
X.shape

(4308, 4)

In [None]:
## batches (weight updates) per epoch = observations / batch size
## a fractional result means one extra, smaller batch is needed for the leftover rows
batch = 100
X.shape[0] / batch

43.08

In [None]:
## fit: 5 epochs with mini-batches of 100 observations
nn.fit(X, y, batch_size = 100, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b8a7197ba00>

In [None]:
## weights and bias after the 5 epochs of training
nn.weights

[<tf.Variable 'dense/kernel:0' shape=(4, 1) dtype=float32, numpy=
 array([[1.1300153 ],
        [0.4703951 ],
        [0.75382495],
        [0.25546932]], dtype=float32)>,
 <tf.Variable 'dense/bias:0' shape=(1,) dtype=float32, numpy=array([0.33407843], dtype=float32)>]

## **Goal**

The goal is to find the best weights and the best bias, i.e., to tune the parameters to their optimal values. A trained model has the best parameters found for the given data. A model that has already been trained and is reused as-is is called a "pre-trained" model; e.g., ChatGPT is a pre-trained neural net.

In [None]:
## a deeper NN
nn1 = Sequential()
## shape is given as a tuple (the documented form); newer Keras versions reject a bare int
nn1.add(Input(shape = (X.shape[1],)))
nn1.add(Dense(10, activation = "relu")) ## H1 the derivative is {0,1}
nn1.add(Dense(5, activation = "relu"))  ## H2
nn1.add(Dense(1, activation = "linear" ))  ## regression (linear)

In [None]:
## parameters per Dense layer = inputs * units + units (one bias per unit)
(4*10 + 10) + (10*5 + 5) + (5*1 + 1)

111

In [None]:
## the total should match the hand count of 111
nn1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 10)                50        
                                                                 
 dense_2 (Dense)             (None, 5)                 55        
                                                                 
 dense_3 (Dense)             (None, 1)                 6         
                                                                 
Total params: 111 (444.00 Byte)
Trainable params: 111 (444.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
## compile: Adam optimizer and mean squared error loss; the reported metric is RMSE
nn1.compile(optimizer = "adam", loss = "mean_squared_error", metrics = [tf.keras.metrics.RootMeanSquaredError()])

In [None]:
## fit: 50 epochs with mini-batches of 32 observations
nn1.fit(X, y, batch_size = 32, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7b8a736d9a80>

In [None]:
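## R2 (coefficient of determination)
## R2 = 1 - SSE/SST, where SSE is the sum of squared residuals and SST is the
## total sum of squares around ybar (this equals SSR/SST for a least-squares linear fit)

## i.e., the model is compared against the baseline prediction y = ybar (average)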

In [None]:
## kernels and biases of the deeper network after training
nn1.weights

[<tf.Variable 'dense_1/kernel:0' shape=(4, 10) dtype=float32, numpy=
 array([[ 4.884289  ,  0.08256906, -0.19915605,  0.15013611,  4.6261144 ,
          4.796598  ,  3.9248853 , -0.15050946,  4.2314506 , -0.28682336],
        [ 1.8514935 , -0.37287036, -0.2393775 , -0.41574317,  2.331868  ,
          2.8497903 ,  1.7188504 ,  0.01952243,  1.8543427 , -0.6148599 ],
        [ 0.69465536, -0.21028906, -0.06992441, -0.4817329 ,  0.44170555,
         -0.39742118,  0.8978245 , -0.14551088,  0.98666334, -0.63004446],
        [ 4.878283  , -0.25329795, -0.14383328, -0.03242987,  5.608528  ,
          5.5731544 ,  4.7261195 , -0.60346663,  4.7189054 ,  0.12387103]],
       dtype=float32)>,
 <tf.Variable 'dense_1/bias:0' shape=(10,) dtype=float32, numpy=
 array([ 6.344052  ,  0.        ,  0.        ,  0.        ,  6.407063  ,
         6.592677  ,  6.314718  , -0.02290215,  6.318184  ,  0.        ],
       dtype=float32)>,
 <tf.Variable 'dense_2/kernel:0' shape=(10, 5) dtype=float32, numpy=
 arra