**Mathematical expression of the algorithm**:

For one example $x^{(i)}$:
$$z^{(i)} = w^T x^{(i)} + b \tag{1}$$
$$\hat{y}^{(i)} = a^{(i)} = sigmoid(z^{(i)})\tag{2}$$ 
$$ \mathcal{L}(yhat^{(i)}, y^{(i)}) =  - y^{(i)}  \log(yhat^{(i)}) - (1-y^{(i)} )  \log(1-yhat^{(i)})\tag{3}$$

The cost is then computed by averaging the loss over all $m$ training examples:
$$ J = \frac{1}{m} \sum_{i=1}^m \mathcal{L}(yhat^{(i)}, y^{(i)})\tag{6}$$

Gradient computation:
- $$ \frac{\partial J}{\partial w} = \frac{1}{m}X(yhat-Y)^T\tag{7}$$
- $$ \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^m (yhat^{(i)}-y^{(i)})\tag{8}$$


In [None]:
import numpy as np
import tensorflow as tf

## Weight and bias initializer

In [None]:
def initializer(input_dim: int) -> tuple:
    """Create the starting parameters of the model.

    Args:
        input_dim: Number of rows of the weight column vector.

    Returns:
        A ``(w, b)`` pair where ``w`` is a float64 tensor of zeros with shape
        ``[input_dim, 1]`` and ``b`` is the Python float ``0.0``.
    """
    weight_shape = [input_dim, 1]
    return tf.zeros(weight_shape, dtype=tf.float64), 0.0

In [None]:
# Build zero-initialised parameters for input_dim=3 and display the weights.
W, b = initializer(3)
W

In [None]:
# Display the transpose of W, the form in which forward() multiplies it with X.
tf.transpose(W)

## Calculate $z$ for all $x^{(i)}$:
$$z^{(i)} = w^T x^{(i)} + b \tag{1}$$

In [None]:
def forward(W: tf.Tensor, b: tf.float64, X: tf.Tensor):
    """Compute the linear scores ``Z = W^T X + b`` (equation 1).

    Args:
        W: Weight tensor; it is transposed before being multiplied with ``X``.
        b: Bias added to every entry of the product.
        X: Input tensor contracted with ``W^T`` over one axis.
            In this notebook ``W`` is ``[n, 1]`` and ``X`` is ``[n, m]``,
            which yields a ``[1, m]`` result.

    Returns:
        The tensor ``W^T X + b``.
    """
    return tf.tensordot(tf.transpose(W), X, axes=1) + b

In [None]:
# Toy 3x3 float64 input matrix for trying out the linear step.
X = tf.Variable(
    [
        [2, 4, -3],
        [3, 6, -2],
        [4, 6, -1]
        ], dtype=tf.float64
)
# NOTE(review): Y holds a single label with shape (1,); it is presumably
# broadcast against the predictions for all columns of X — confirm intent.
Y = tf.Variable([1], dtype=tf.float64)
# Raw product W^T X, i.e. the linear step without the bias term.
tf.tensordot(tf.transpose(W), X, axes=1)

In [None]:
# Linear scores W^T X + b for the toy inputs.
z = forward(W, b, X)
z

# Sigmoid Function
Compute $sigmoid(z) = \frac{1}{1 + e^{-z}}$ for $z = w^T x + b$ to make predictions. Use np.exp() or tf.exp().

In [None]:
def sigmoid(Z: tf.Tensor):
    """Apply the logistic function ``1 / (1 + e^{-Z})`` element-wise.

    Args:
        Z: Tensor of linear scores.

    Returns:
        A tensor with the sigmoid of every entry of ``Z``.
    """
    denominator = 1 + tf.exp(-Z)
    return 1 / denominator

In [None]:
# Pass the linear scores through the sigmoid to obtain predictions.
yhat = sigmoid(z)
yhat

## Calculate the Cost :
 $J = -\frac{1}{m}\sum_{i=1}^{m}(y^{(i)}\log(yhat^{(i)})+(1-y^{(i)})\log(1-yhat^{(i)}))$

In [None]:
def compute_cost(Y: tf.Tensor, Yhat: tf.Tensor):
    """Return the mean binary cross-entropy between labels and predictions.

    Computes ``J = -(1/m) * sum(Y*log(Yhat) + (1-Y)*log(1-Yhat))`` where ``m``
    is the size of the second dimension of ``Yhat``.

    Predictions are clipped to ``[eps, 1 - eps]`` (``eps`` being the machine
    epsilon of ``Yhat``'s dtype) before the logarithms are taken. Without
    this, a prediction that saturates to exactly 0 or 1 yields ``log(0) =
    -inf`` and the product ``0 * -inf`` turns the whole cost into NaN.
    Predictions strictly inside that interval are unaffected.

    Args:
        Y: Labels; must broadcast against ``Yhat``.
        Yhat: Predicted probabilities with at least two dimensions.

    Returns:
        A scalar tensor holding the cost.
    """
    Yhat = tf.convert_to_tensor(Yhat)
    m = Yhat.shape[1]

    # Keep both log arguments strictly positive.
    eps = np.finfo(Yhat.dtype.as_numpy_dtype).eps
    clipped = tf.clip_by_value(Yhat, eps, 1 - eps)

    log_likelihood = tf.reduce_sum(
        (Y * tf.math.log(clipped)) + ((1 - Y) * tf.math.log(1 - clipped))
    )
    return (-1 / m) * log_likelihood


In [None]:
# Cost of the current predictions against the labels.
compute_cost(Y, yhat)

## Forward Propagation:
- You get X
- You compute $yhat = \sigma(w^T X + b) $
- You calculate the cost function: $J = -\frac{1}{m}\sum_{i=1}^{m}(y^{(i)}\log(yhat^{(i)})+(1-y^{(i)})\log(1-yhat^{(i)}))$

In [None]:
def forward_prop(W: tf.Tensor, b: tf.Tensor, X: tf.Tensor, Y: tf.Tensor):
    """Run one forward pass: linear step, sigmoid activation, then cost.

    Args:
        W: Weight tensor passed to ``forward``.
        b: Bias passed to ``forward``.
        X: Input tensor passed to ``forward``.
        Y: Labels passed to ``compute_cost``.

    Returns:
        A ``(Yhat, cost)`` pair: the sigmoid activations and the cost with
        all size-1 dimensions squeezed away.
    """
    activations = sigmoid(forward(W, b, X))
    return activations, tf.squeeze(compute_cost(Y, activations))


In [None]:
# Toy float64 data set: X is 2x5 and Y is 1x5.
X = tf.Variable(
    [
        [2, 3, 4, 5, 6],
        [7, 2, 3, 4, 8],
    ], dtype=tf.float64
)
Y = tf.Variable([[1, 1, 0, 0, 1]], dtype=tf.float64)
# Size of Y's second dimension — the value back_prop reads as m.
Y.shape[1]

In [None]:
# Re-initialise the parameters for the 2-row X and run one forward pass.
W, b = initializer(2)
Yhat, cost = forward_prop(W, b, X, Y)
Yhat, cost

In [None]:
# tf.squeeze removes size-1 dimensions: the nested 1x1x1x2 input becomes a length-2 vector.
tf.squeeze([[[[3.0, 2.0]]]])

## Back Propagation: 

- $$ \frac{\partial J}{\partial w} = \frac{1}{m}X(yhat-y)^T\tag{7}$$
- $$ \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^m (yhat^{(i)}-y^{(i)})\tag{8}$$

In [None]:
def back_prop(X: tf.Tensor, Yhat: tf.Tensor, Y: tf.Tensor) -> dict:
    """Compute the cost gradients with respect to the weights and the bias.

    Implements equations (7) and (8):
    ``dW = (1/m) * X (Yhat - Y)^T`` and ``db = (1/m) * sum(Yhat - Y)``,
    where ``m`` is the size of the second dimension of ``Y``.

    Args:
        X: Input tensor.
        Yhat: Predictions.
        Y: Labels with at least two dimensions.

    Returns:
        A dict with keys ``'dW'`` and ``'db'`` holding the two gradients.
    """
    scale = 1 / Y.shape[1]
    residual = Yhat - Y
    return {
        'dW': scale * tf.tensordot(X, tf.transpose(residual), axes=1),
        'db': scale * tf.reduce_sum(residual),
    }

In [None]:
# Gradients of the cost for the forward pass computed above.
grads = back_prop(X, Yhat, Y)
grads

# Optimizer

In [None]:

def optimizer(X: tf.Tensor, Y: tf.Tensor, epochs: int = 100, alpha=0.01,
              log_every: int = 100):
    """Fit weights and bias to ``(X, Y)`` with batch gradient descent.

    Parameters start from ``initializer(X.shape[0])``. Each epoch runs a
    forward pass, computes the gradients and applies
    ``W -= alpha * dW`` and ``b -= alpha * db``.

    Args:
        X: Input tensor; its first dimension sets the number of weights.
        Y: Labels.
        epochs: Number of gradient-descent iterations.
        alpha: Learning rate.
        log_every: Print the cost whenever ``epoch`` is a multiple of this
            value. A falsy value (``0`` or ``None``) disables printing.

    Returns:
        ``(W, b, grads, cost)``. ``grads`` and ``cost`` come from the last
        iteration, i.e. they were computed *before* that iteration's update
        was applied. When ``epochs <= 0`` no iteration runs and both are
        ``None``.
    """
    # initialize params W, b
    input_dim = X.shape[0]
    W, b = initializer(input_dim)

    # Bind these up front so a zero-epoch run returns instead of raising
    # UnboundLocalError at the return statement.
    grads, cost = None, None

    for epoch in range(epochs):
        # forward propagation
        Yhat, cost = forward_prop(W, b, X, Y)

        # back propagation
        grads = back_prop(X, Yhat, Y)

        # update state
        W = W - (alpha * grads['dW'])
        b = b - (alpha * grads['db'])

        if log_every and epoch % log_every == 0:
            print(f"Epoch: {epoch} => Cost: {cost}")

    return W, b, grads, cost


In [None]:
# Toy float64 data set used to try out the optimizer: X is 2x5 and Y is 1x5.
X = tf.Variable(
    [
        [2, 3, 4, 5, 6],
        [7, 2, 3, 4, 8],
    ], dtype=tf.float64
)
Y = tf.Variable([[1, 1, 0, 0, 1]], dtype=tf.float64)
# Size of Y's second dimension — the value back_prop reads as m.
Y.shape[1]

In [None]:
# Train on the toy data; optimizer prints the cost every 100 epochs.
optimizer(X, Y, alpha=0.1, epochs = 8000)

In [None]:
# Load the scikit-learn breast-cancer data set as a (features, labels) pair.
from sklearn.datasets import load_breast_cancer
X, Y = load_breast_cancer(return_X_y=True)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the feature columns with StandardScaler.
X = StandardScaler().fit_transform(X)
# Transpose so the first dimension is the one optimizer reads as input_dim
# (X.shape[0]).
X = tf.Variable(X.T, dtype=tf.float64)
# Wrapping Y in a list adds a leading dimension, so Y.shape[1] is defined.
Y = tf.Variable([Y], dtype=tf.float64)

In [None]:
# Inspect the labels' shape after the leading dimension was added.
Y.shape

In [None]:
# Train on the standardised data; optimizer prints the cost every 100 epochs.
optimizer(X, Y, epochs=10000, alpha=0.09)

# Model
- Initialize $$ w,b $$
- Forward Propagation:
    - You get X
    - You compute $yhat = \sigma(w^T X + b) $
    - You calculate the cost function: $J = -\frac{1}{m}\sum_{i=1}^{m}(y^{(i)}\log(yhat^{(i)})+(1-y^{(i)})\log(1-yhat^{(i)}))$
- Back Propagation: 
    - $$ \frac{\partial J}{\partial w} = \frac{1}{m}X(yhat-y)^T\tag{7}$$
    - $$ \frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^m (yhat^{(i)}-y^{(i)})\tag{8}$$
- Update weights:
    - $$ w = w - {\alpha} * \frac{\partial J}{\partial w} $$
    - $$ b = b- {\alpha}  * \frac{\partial J}{\partial b} $$

In [None]:
class NeuralNetwork:
    """Logistic-regression model (one sigmoid unit) trained by gradient descent.

    State kept on the instance:
        weights: float64 tensor of shape ``[input_dim, 1]``, zero-initialised.
        bias: Scalar bias, initially ``0.0``.
        alpha: Learning rate read by ``update_weights``; ``optimize``
            overwrites it with its own ``alpha`` argument.
        cost: Most recent cost produced by ``forward_prop`` (``None`` until
            the first forward pass).
    """

    def __init__(self, input_dim: int):
        """Create zero-initialised parameters for ``input_dim`` inputs."""
        # Use the class's own initializer instead of depending on a
        # module-level function of the same name happening to exist.
        self.weights, self.bias = self.initializer(input_dim=input_dim)
        # Defined up front so update_weights also works when called before
        # optimize; the value matches optimize's default.
        self.alpha = 0.001
        self.cost = None

    @staticmethod
    def initializer(input_dim: int) -> tuple:
        """Return ``(weights, bias)``: a ``[input_dim, 1]`` zero tensor and ``0.0``."""
        weights = tf.zeros([input_dim, 1], dtype=tf.float64)
        bias = 0.0
        return weights, bias

    def forward(self, X: tf.Tensor, Y: tf.Tensor = None):
        """Return the linear scores ``weights^T X + bias``.

        ``Y`` is not used; it is accepted only so that existing calls which
        pass it keep working.
        """
        return tf.tensordot(tf.transpose(self.weights), X, axes=1) + self.bias

    def sigmoid(self, Z: tf.Tensor):
        """Apply the logistic function ``1 / (1 + e^{-Z})`` element-wise."""
        return 1/(1 + (tf.exp(-Z)))

    def compute_cost(self, X: tf.Tensor, Y: tf.Tensor, Yhat: tf.Tensor):
        """Return the mean binary cross-entropy of ``Yhat`` against ``Y``.

        ``X`` is not used. Predictions are clipped to ``[eps, 1 - eps]``
        (machine epsilon of ``Yhat``'s dtype) so that a prediction saturated
        at exactly 0 or 1 cannot produce ``log(0)`` and a NaN cost.
        """
        Yhat = tf.convert_to_tensor(Yhat)
        eps = np.finfo(Yhat.dtype.as_numpy_dtype).eps
        clipped = tf.clip_by_value(Yhat, eps, 1 - eps)
        log_likelihood = tf.reduce_sum(
            Y * tf.math.log(clipped) + ((1 - Y) * tf.math.log(1 - clipped))
        )
        return (-1/Y.shape[1]) * log_likelihood

    def forward_prop(self, X: tf.Tensor, Y: tf.Tensor):
        """Run linear step, activation and cost; return ``(Yhat, cost)``.

        The unsqueezed cost is also stored on ``self.cost``.
        """
        Z = self.forward(X, Y)
        Yhat = self.sigmoid(Z)
        self.cost = self.compute_cost(X, Y, Yhat)
        return Yhat, tf.squeeze(self.cost)

    def back_prop(self, X: tf.Tensor, Y: tf.Tensor, Yhat: tf.Tensor):
        """Return ``{'dW': ..., 'db': ...}``, the gradients of the cost.

        ``dW = (1/m) * X (Yhat - Y)^T`` and ``db = (1/m) * sum(Yhat - Y)``
        with ``m`` the size of ``Yhat``'s second dimension.
        """
        m = Yhat.shape[1]
        residual = Yhat - Y
        dW = (1/m) * tf.tensordot(X, tf.transpose(residual), axes=1)
        db = (1/m) * tf.reduce_sum(residual)
        return {'dW': dW, 'db': db}

    def update_weights(self, grads: dict):
        """Take one gradient-descent step of size ``self.alpha``."""
        self.weights = self.weights - (self.alpha * grads['dW'])
        self.bias = self.bias - (self.alpha * grads['db'])

    def optimize(self, X: tf.Tensor, Y: tf.Tensor, epochs: int = 1000, alpha=0.001):
        """Train in place for ``epochs`` iterations with learning rate ``alpha``.

        The cost is printed every 100 epochs. Nothing is returned; the fitted
        parameters are left on ``self.weights`` and ``self.bias``.
        """
        self.alpha = alpha
        for epoch in range(epochs):
            Yhat, self.cost = self.forward_prop(X, Y)
            # Must be the method: the module-level back_prop takes
            # (X, Yhat, Y), so calling it with this argument order would
            # negate the gradients.
            grads = self.back_prop(X, Y, Yhat)
            self.update_weights(grads)
            if epoch % 100 == 0:
                print(f"Epoch: {epoch} => Cost: {self.cost}")


# Test Model

In [None]:
# input_dim must equal X.shape[0] for the weights to line up with X in forward.
# NOTE(review): 30 presumably matches the breast-cancer feature count — confirm.
nn = NeuralNetwork(30)
# alpha is left at optimize's default of 0.001.
nn.optimize(X, Y, epochs=3000)