In [6]:
import random
import math
import numpy as np

import random

def get_training_samples(batch_size):
  """
  Yields batches of randomly selected lines from the training data.

  Args:
      batch_size: The desired number of samples in each batch.

  Yields:
      A tuple containing:
          labels: A list of labels for the samples in the batch.
          inputs: A list of input vectors for the samples in the batch.
  """

  with open("train.csv") as file:
    text = file.read()
  textlines = text.strip().split("\n")

  # Randomly shuffle the data lines for better training
  random.shuffle(textlines)

  while True:  # Loop indefinitely to keep yielding batches
    batch_start = random.randint(0, len(textlines) - batch_size)
    batch_end = min(batch_start + batch_size, len(textlines))  # Limit end to file length

    labels = []
    inputs = []
    for textline in textlines[batch_start:batch_end]:
      cells = textline.split(",")
      labels.append(int(cells[0]))
      inputs.append([float(c) for c in cells[11:]])

    yield labels, inputs

def get_training_samples(batch_size):
    with open("train.csv") as file:
        text = file.read()
    textlines = text.strip().split("\n")
    random.shuffle(textlines)
    start = 0
    while start < len(textlines):
        labels = []
        targets = []
        inputs = []
        end = start + batch_size
        for textline in textlines[start:end]:
            cells = textline.split(",")
            labels.append(int(cells[0]))
            targets.append([float(c) for c in cells[1:11]])
            inputs.append([float(c) for c in cells[11:]])
        #yield labels, targets, inputs
        yield labels, inputs
        start += batch_size

def get_test_samples():
    with open("test.csv", "r") as file:
        text = file.read()
    textlines = text.strip().split("\n")
    labels = []
    targets = []
    inputs = []
    for textline in textlines:
        cells = textline.split(",")
        value = int(cells[0])
        labels.append(int(cells[0]))
        targets.append([float(c) for c in cells[1:11]])
        inputs.append([float(c) for c in cells[11:]])
    return labels, targets, inputs

def plot_number(inputs):
    line = ""
    for p in inputs:
        line += ".░▒▓█"[round(p * 4)]
        if len(line) > 27:
            print(line)
            line = ""

def one_hot_encode(labels, num_classes):
    """
    Convert a list of numerical labels to one-hot encoded format.

    Args:
    labels (list or np.ndarray): List of numerical labels to be one-hot encoded.
    num_classes (int): Total number of classes or unique labels.

    Returns:
    np.ndarray: One-hot encoded matrix of shape (len(labels), num_classes).
    """
    # Create an array of zeros with shape (len(labels), num_classes)
    one_hot = np.zeros((len(labels), num_classes), dtype=int)

    # Set the appropriate elements to 1
    one_hot[np.arange(len(labels)), labels] = 1

    return one_hot

# Example usage


In [7]:
def softmax(predictions):
    m = max(predictions)
    temp = [math.exp(p - m) for p in predictions]
    total = sum(temp)
    return [t / total for t in temp]

def sigmoid(value):
    # Clip value to avoid overflow in exp
    clipped_value = np.clip(value, -500, 500)  # You can adjust these limits based on your actual value range
    return 1 / (1 + np.exp(-clipped_value))
    
#def log_loss(activations, targets):
#    losses = [-t * math.log(a) - (1 - t) * math.log(1 - a) for a, t in zip(activations, targets)]
#    return sum(losses)

def relu(x):
    return np.maximum(0, x)

def log_loss(activations, targets):
    # Clipping values to avoid math domain error
    clipped_activations = [max(1e-15, min(a, 1 - 1e-15)) for a in activations]
    losses = [-t * math.log(a) - (1 - t) * math.log(1 - a) for a, t in zip(clipped_activations, targets)]
    return sum(losses)

def clip_gradient(gradient, min_value= -0.2, max_value=0.2):
    """
    Clip the gradient to a specified range.

    :param gradient: The calculated gradient, which can be an array.
    :param min_value: The minimum allowed value for the gradient.
    :param max_value: The maximum allowed value for the gradient.
    :return: The clipped gradient.
    """
    return np.clip(gradient, min_value, max_value)


# -------------------------------
# Weight Initialisation
# -------------------------------

def initialise_weight(in_channel, out_channel):
    """
    """
    W = np.random.randn(in_channel, out_channel).astype(np.float32) * np.sqrt(2.0/(in_channel))
    return W


def initialise_bias(out_channel):
    """
    """
    b = np.zeros(out_channel).astype(np.float32)
    return b


# -------------------------------
# Loss Functions
# -------------------------------

def BCELoss(x, y, derivative=False):
    """
    """
    def _BCE_loss_forward(x, y):
        loss = np.sum(- y * np.log(x + eps) + - (1 - y) * np.log((1 - x) + eps))
        return loss

    def _BCE_loss_derivative(x, y):
        dloss = -y * (1 / (x + eps))
        return dloss
    
    if derivative:
        return _BCE_loss_derivative(x, y)
    else:
        return _BCE_loss_forward(x, y)


def MSELoss(x, y, derivative=False):
    """
    """
    def _MSE_loss_forward(x, y):
        loss = (np.square(y - x)).mean()
        return loss

    def _MSE_loss_derivative(x, y):
        dloss = 2 * (x - y)
        return dloss
    
    if derivative:
        return _MSE_loss_derivative(x, y)
    else:
        return _MSE_loss_forward(x, y)


# -------------------------------
# Activation Functions
# -------------------------------

def sigmoid(x, derivative=False):
    #res = 1/(1+np.exp(-x))
    clipped_value = np.clip(x, -1000, 1000)  # You can adjust these limits based on your actual value range
    res = 1 / (1 + np.exp(-clipped_value))
    if derivative:
        return res*(1-res)
    return res

def relu(x, derivative=False):
    res = x
    if derivative:
        return 1.0 * (res > 0)
    else:
        return res * (res > 0)   
    
def lrelu(x, alpha=0.01, derivative=False):
    res = x
    if derivative:
        dx = np.ones_like(res)
        dx[res < 0] = alpha
        return dx
    else:
        return np.maximum(x, x*alpha, x)

def tanh(x, derivative=False):
    res = np.tanh(x)
    if derivative:
        return 1.0 - np.tanh(x) ** 2
    return res

def one_hot_encode(labels, num_classes):
    """
    Convert a list of numerical labels to one-hot encoded format.

    Args:
    labels (list or np.ndarray): List of numerical labels to be one-hot encoded.
    num_classes (int): Total number of classes or unique labels.

    Returns:
    np.ndarray: One-hot encoded matrix of shape (len(labels), num_classes).
    """
    # Create an array of zeros with shape (len(labels), num_classes)
    one_hot = np.zeros((len(labels), num_classes), dtype=int)

    # Set the appropriate elements to 1
    one_hot[np.arange(len(labels)), labels] = 1

    return one_hot

# Example usage
labels = [0, 1, 2, 3, 0, 2, 1]  # Example list of labels
num_classes = 4  # Assuming 4 classes for this example

one_hot_encoded = one_hot_encode(labels, num_classes)
print(one_hot_encoded)

[[1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [1 0 0 0]
 [0 0 1 0]
 [0 1 0 0]]


In [58]:
import numpy as np

epochs = 100
batch_size = 20
learning_rate = 0.001
input_count, hidden_count, mean_Count, variance_Count, hidden_cnt, output_count = 784, 262, 152, 152, 262, 784

W1 = np.random.randn(input_count + 10, hidden_count).astype(np.float32) * np.sqrt(2.0/(input_count + 10))
b1 = np.zeros(hidden_count).astype(np.float32)

mu_Weight = np.random.randn(hidden_count, mean_Count).astype(np.float32) * np.sqrt(2.0/(hidden_count))
mu_Bias = np.zeros(mean_Count).astype(np.float32)

sd_Weight = np.random.randn(hidden_count, variance_Count).astype(np.float32) * np.sqrt(2.0/(hidden_count))
sd_Bias = np.zeros(variance_Count).astype(np.float32)

W2 = np.random.randn(mean_Count + 10, hidden_cnt).astype(np.float32) * np.sqrt(2.0/(mean_Count))
b2 = np.zeros(hidden_cnt).astype(np.float32)

W3 = np.random.randn(hidden_cnt, output_count).astype(np.float32) * np.sqrt(2.0/(hidden_cnt))
b3 = np.zeros(output_count).astype(np.float32)

In [59]:
class AdamOptimizer:
    def __init__(self, parameters, learning_rate=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.momentum = [np.zeros_like(param) for param in parameters]
        self.velocity = [np.zeros_like(param) for param in parameters]
        self.t = 0

    def update(self, parameters, grads):
        self.t += 1  # Update time step
        updated_params = []
        for i, (param, grad) in enumerate(zip(parameters, grads)):
            self.momentum[i] = self.beta1 * self.momentum[i] + (1 - self.beta1) * grad
            self.velocity[i] = self.beta2 * self.velocity[i] + (1 - self.beta2) * np.power(grad, 2)
            
            m_hat = self.momentum[i] / (1 - np.power(self.beta1, self.t))
            v_hat = self.velocity[i] / (1 - np.power(self.beta2, self.t))
            
            param_update = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.eps)
            parameters[i] -= param_update
            updated_params.append(parameters[i])
        return updated_params

In [60]:
lr = 0.01
momentum = [0.0] * 10
velocity = [0.0] * 10
beta1 = 0.9
beta2 = 0.9999
#t = 0

def optimise(grads):
    t = 0
    t += 1
        # Calculate gradient with momentum and velocity
    for i, grad in enumerate(grads):
        #print(i)
        momentum[i] = beta1 * momentum[i] + (1 - beta1) * grad
        velocity[i] = beta2 * velocity[i] + (1 - beta2) * np.power(grad, 2)
        m_h = momentum[i] / (1 - (beta1 ** t))
        v_h = velocity[i] /  (1 - (beta2 ** t))
        grads[i] = m_h / np.sqrt(v_h + eps)

        #print('---------encoder------------',grad_W0.shape, grad_b0.shape, grad_W_mu.shape, grad_b_mu.shape, grad_W_logvar.shape, grad_b_logvar.shape)
        W3, b3, W2, b2, W1, b1, mu_Weight, mu_Bias, sd_Weight, sd_Bias = grads
        



In [None]:
import numpy as np
beta = 3

for epoch in range(epochs):
    index = 0
    for labels, inputs in get_training_samples(batch_size):
        index += 1

        inputs = np.array(inputs)
        targets = inputs.copy()
        #print(labels)
        one_hot_encoded = np.array(one_hot_encode(labels, 10))
        #print(one_hot_encoded)
        inputConcatenate =  np.concatenate([inputs, one_hot_encoded], axis = 1)
        #print(inputs.shape, inputConcatenate.shape)

        hValue = np.dot(inputConcatenate, W1) + b1
        hNeuron = sigmoid(hValue)

        #mean and standard deviation
        muNeuron = (np.dot(hNeuron, mu_Weight) + mu_Bias)
        sdNeuron = (np.dot(hNeuron, sd_Weight) + sd_Bias)
        #print(hNeuron.shape, sdNeuron.shape, muNeuron.shape)      
        randomSample =  np.random.standard_normal(size=(batch_size, muNeuron.shape[1]))
        zSpace = muNeuron + np.exp(sdNeuron * 0.5) * randomSample
        zSpace = np.concatenate([zSpace, one_hot_encoded], axis = 1)
        #print(zSpace.shape)
        
        # Decoder
        hdValue = np.dot(zSpace, W2) + b2
        hdNeuron = sigmoid(hdValue)
        oValue = np.dot(hdNeuron, W3) + b3
        output = sigmoid(oValue)
        
        # Decoder Backpropagation
        d_error = MSELoss(output, inputs, derivative=True)
        d_pred = sigmoid(oValue, derivative=True)
        d_output = d_error * d_pred
        
        #d_b3 = d_output
        d_b3 = np.sum(d_output, axis=0)
        #print(d_b3.shape)
        d_W3 = np.dot(hdNeuron.T, d_output)

        d_hdNeuron = np.dot(d_output, W3.T) * sigmoid(hdValue, derivative=True)
        d_b2 = np.sum(d_hdNeuron, axis=0)
        #d_b2 = d_hdNeuron
        d_W2 = np.dot(zSpace.T, d_hdNeuron)
        
        d_zSpace = np.dot(d_hdNeuron, W2.T)
        d_zSpace = d_zSpace[:, :152]
        #print('---------',d_zSpace.shape)

        # Encoder Backpropagation 
        d_muNeuron = d_zSpace
        d_muBias = np.sum(d_muNeuron, axis = 0)
        d_muWeight = np.dot(hNeuron.T, d_muNeuron) 

        d_sdNeuron = d_zSpace * np.exp(sdNeuron * .5) * .5 * randomSample
        #d_sdBias = d_sdNeuron
        d_sdBias = np.sum(d_sdNeuron, axis = 0)
        d_sdWeight = np.dot(hNeuron.T, d_sdNeuron)

        hNeuronDerivative = sigmoid(hValue, derivative=True)
        dhNeuron = hNeuronDerivative * (np.dot(d_muBias, mu_Weight.T) + np.dot(d_sdNeuron, sd_Weight.T))  
        db1 = np.sum(dhNeuron, axis = 0)
        #db1 = dhNeuron
        dW1 = np.dot(inputConcatenate.T, dhNeuron)
        
        dk1_muNeuron = .5 * 2 * muNeuron
        dkl_muBias = np.sum(dk1_muNeuron, axis = 0)
        #dkl_muBias = dk1_muNeuron
        dkl_muWeight = np.dot(hNeuron.T, dk1_muNeuron) * beta

        dk1_sdNeuron = .5 * (np.exp(sdNeuron) - 1)
        dkl_sdBias = np.sum(dk1_sdNeuron, axis = 0)
        #dkl_sdBias = dk1_sdNeuron
        dkl_sdWeight = np.dot(hNeuron.T, dk1_sdNeuron) * beta
  
        dkl_hNeuron = hNeuronDerivative * (np.dot(dk1_muNeuron, mu_Weight.T) + np.dot(dk1_sdNeuron, sd_Weight.T))
        dkl_W1 = np.dot(inputConcatenate.T, dkl_hNeuron)
        dkl_b1 = np.sum(dkl_hNeuron, axis = 0)
        #dkl_b1 = dkl_hNeuron

        grad_b_logvar = dkl_sdBias + d_sdBias
        grad_W_logvar = dkl_sdWeight + d_sdWeight
        grad_b_mu = dkl_muBias + d_muBias
        grad_W_mu = dkl_muWeight + d_muWeight
        grad_b1 = dkl_b1 + db1
        grad_W1 = dkl_W1 + dW1   
        #print(grad_W1.shape)

        d_W3 = clip_gradient(d_W3)
        d_b3 = clip_gradient(d_b3)
        d_W2 = clip_gradient(d_W2)
        d_b2 = clip_gradient(d_b2)
        grad_W1 = clip_gradient(grad_W1)
        grad_b1 = clip_gradient(grad_b1)
        grad_W_mu = clip_gradient(grad_W_mu)
        grad_b_mu = clip_gradient(grad_b_mu)
        grad_W_logvar = clip_gradient(grad_W_logvar)
        grad_b_logvar = clip_gradient(grad_b_logvar)

        #grads = [W3, b3, W2, b2, W1, b1, mu_Weight, mu_Bias, sd_Weight, sd_Bias]
        #optimise(grads)
        
        # Update weights and biases
        W3 -= learning_rate * d_W3
        b3 -= learning_rate * d_b3
        W2 -= learning_rate * d_W2
        b2 -= learning_rate * d_b2
        W1 -= learning_rate * grad_W1
        b1 -= learning_rate * grad_b1
        mu_Weight -= learning_rate * grad_W_mu
        mu_Bias -= learning_rate * grad_b_mu
        sd_Weight -= learning_rate * grad_W_logvar
        sd_Bias -= learning_rate * grad_b_logvar

        klbloss = -0.5 * np.sum(1 + sdNeuron - muNeuron**2 - np.exp(2 * sdNeuron)) / batch_size

        #loss = -0.5 * np.sum(1 + self.latent_logvar - self.latent_mu**2 - np.exp(self.latent_logvar)) / (self.batch_size * self.latent_dim)
        rec_loss = MSELoss(output, inputs)
        totalLoss = rec_loss + klbloss
        #print('---------------', index, '---------------')
        print(totalLoss)

    

In [62]:
labels, targets, inputs = get_test_samples()

hNeuron = sigmoid(np.dot(np.concatenate([inputs,one_hot_encode(labels, 10)], axis = 1), W1) + b1)
#print(pred_h[3].shape)

muNeuron = (np.dot(hNeuron, mu_Weight) + mu_Bias)
sdNeuron = (np.dot(hNeuron, sd_Weight) + sd_Bias)

#print(muNeuron.shape)

randomSample = np.random.standard_normal(size=(1000, muNeuron.shape[1]))
zSpace = muNeuron + np.exp(sdNeuron * 0.5) * randomSample

hdNeuron = sigmoid(np.dot(np.concatenate([zSpace,one_hot_encode(labels, 10)], axis = 1), W2) + b2)
output = sigmoid(np.dot(hdNeuron, W3) + b3)


for v, i in zip(inputs[100:105], output[100:105]):
    #print(v)
    plot_number(v)
    plot_number(i)

............................
............................
............▒▓░.............
...........░█▓▒.............
...........█▒...............
..........▒█................
..........█▒................
..........█░................
.........░█.................
.........▓█.................
.........▓▓......░▒▒▒░......
.........█░.....▒██▓▓█░.....
.........█░....▓█▒...░▓.....
........░█....▒█░.....█.....
........░█....█░......▓░....
........░█░..▒█.......▓▒....
.........█▒..░█░......▓░....
.........▓█..░█░.....▒█.....
.........░█▒..▒█▒▒▒░░█░.....
..........▓█▒..░▒▒▒▒▓░......
...........▒██▒▒▒▓█▓░.......
............░▒▓█▓▒░.........
............................
............................
............................
............................
............................
............................
............................
............................
................░░░.........
.............░░░░░░░░.......
............░░░▒▒▒░░░.......
...........░░░▒▒▒▒░░........
..........░░░▒

In [343]:
lr = 0.01
momentum = [0.0] * 10
velocity = [0.0] * 10
beta1 = 0.9
beta2 = 0.9999
#t = 0

def optimise(grads):
    t = 0
    t += 1
        # Calculate gradient with momentum and velocity
    for i, grad in enumerate(grads):
        #print(i)
        momentum[i] = beta1 * momentum[i] + (1 - beta1) * grad
        velocity[i] = beta2 * velocity[i] + (1 - beta2) * np.power(grad, 2)
        m_h = momentum[i] / (1 - (beta1 ** t))
        v_h = velocity[i] /  (1 - (beta2 ** t))
        grads[i] = m_h / np.sqrt(v_h + eps)

        #print('---------encoder------------',grad_W0.shape, grad_b0.shape, grad_W_mu.shape, grad_b_mu.shape, grad_W_logvar.shape, grad_b_logvar.shape)
        W3, b3, W2, b2, W1, b1, mu_Weight, mu_Bias, sd_Weight, sd_Bias = grads
        



In [344]:
import numpy as np

epochs = 400
batch_size = 1000
learning_rate = 0.0001
input_count, hidden_count, mean_Count, variance_Count, hidden_cnt, output_count = 784, 262, 200, 200, 262, 784

W1 = np.random.randn(input_count, hidden_count).astype(np.float32) * np.sqrt(2.0/(input_count))
b1 = np.zeros(hidden_count).astype(np.float32)

mu_Weight = np.random.randn(hidden_count, mean_Count).astype(np.float32) * np.sqrt(2.0/(hidden_count))
mu_Bias = np.zeros(mean_Count).astype(np.float32)

sd_Weight = np.random.randn(hidden_count, variance_Count).astype(np.float32) * np.sqrt(2.0/(hidden_count))
sd_Bias = np.zeros(variance_Count).astype(np.float32)

W2 = np.random.randn(mean_Count, hidden_cnt).astype(np.float32) * np.sqrt(2.0/(mean_Count))
b2 = np.zeros(hidden_cnt).astype(np.float32)

W3 = np.random.randn(hidden_cnt, output_count).astype(np.float32) * np.sqrt(2.0/(hidden_cnt))
b3 = np.zeros(output_count).astype(np.float32)

In [None]:
import numpy as np

for epoch in range(epochs):
    index = 0
    for labels, inputs in get_training_samples(batch_size):
        index += 1

        inputs = np.array(inputs)
        targets = inputs.copy()

        hValue = np.dot(inputs, W1) + b1
        hNeuron = sigmoid(hValue)

        #mean and standard deviation
        muNeuron = (np.dot(hNeuron, mu_Weight) + mu_Bias)
        sdNeuron = (np.dot(hNeuron, sd_Weight) + sd_Bias)
        #print(hNeuron.shape, sdNeuron.shape, muNeuron.shape)      
        randomSample =  np.random.standard_normal(size=(batch_size, muNeuron.shape[1]))
        zSpace = muNeuron + np.exp(sdNeuron * 0.5) * randomSample
        #print(zSpace.shape)
        
        # Decoder
        hdValue = np.dot(zSpace, W2) + b2
        hdNeuron = sigmoid(hdValue)
        oValue = np.dot(hdNeuron, W3) + b3
        output = sigmoid(oValue)
        
        # Decoder Backpropagation
        d_error = MSELoss(output, inputs, derivative=True)
        d_pred = sigmoid(oValue, derivative=True)
        d_output = d_error * d_pred
        
        #d_b3 = d_output
        d_b3 = np.sum(d_output, axis=0)
        #print(d_b3.shape)
        d_W3 = np.dot(hdNeuron.T, d_output)

        d_hdNeuron = np.dot(d_output, W3.T) * sigmoid(hdValue, derivative=True)
        d_b2 = np.sum(d_hdNeuron, axis=0)
        #d_b2 = d_hdNeuron
        d_W2 = np.dot(zSpace.T, d_hdNeuron)
        
        d_zSpace = np.dot(d_hdNeuron, W2.T)
        print('---------',d_zSpace.shape)

        # Encoder Backpropagation 
        d_muNeuron = d_zSpace
        d_muBias = np.sum(d_muNeuron, axis = 0)
        d_muWeight = np.dot(hNeuron.T, d_muNeuron) 

        d_sdNeuron = d_zSpace * np.exp(sdNeuron * .5) * .5 * randomSample
        #d_sdBias = d_sdNeuron
        d_sdBias = np.sum(d_sdNeuron, axis = 0)
        d_sdWeight = np.dot(hNeuron.T, d_sdNeuron)

        hNeuronDerivative = sigmoid(hValue, derivative=True)
        dhNeuron = hNeuronDerivative * (np.dot(d_muBias, mu_Weight.T) + np.dot(d_sdNeuron, sd_Weight.T))  
        db1 = np.sum(dhNeuron, axis = 0)
        #db1 = dhNeuron
        dW1 = np.dot(inputs.T, dhNeuron)
        
        dk1_muNeuron = .5 * 2 * muNeuron
        dkl_muBias = np.sum(dk1_muNeuron, axis = 0)
        #dkl_muBias = dk1_muNeuron
        dkl_muWeight = np.dot(hNeuron.T, dk1_muNeuron) 

        dk1_sdNeuron = .5 * (np.exp(sdNeuron) - 1)
        dkl_sdBias = np.sum(dk1_sdNeuron, axis = 0)
        #dkl_sdBias = dk1_sdNeuron
        dkl_sdWeight = np.dot(hNeuron.T, dk1_sdNeuron)
  
        dkl_hNeuron = hNeuronDerivative * (np.dot(dk1_muNeuron, mu_Weight.T) + np.dot(dk1_sdNeuron, sd_Weight.T))
        dkl_W1 = np.dot(inputs.T, dkl_hNeuron)
        dkl_b1 = np.sum(dkl_hNeuron, axis = 0)
        #dkl_b1 = dkl_hNeuron

        grad_b_logvar = dkl_sdBias + d_sdBias
        grad_W_logvar = dkl_sdWeight + d_sdWeight
        grad_b_mu = dkl_muBias + d_muBias
        grad_W_mu = dkl_muWeight + d_muWeight
        grad_b1 = dkl_b1 + db1
        grad_W1 = dkl_W1 + dW1     

        grads = [W3, b3, W2, b2, W1, b1, mu_Weight, mu_Bias, sd_Weight, sd_Bias]
        optimise(grads)
        
        # Update weights and biases
        W3 -= learning_rate * d_W3
        b3 -= learning_rate * d_b3
        W2 -= learning_rate * d_W2
        b2 -= learning_rate * d_b2
        W1 -= learning_rate * grad_W1
        b1 -= learning_rate * grad_b1
        mu_Weight -= learning_rate * grad_W_mu
        mu_Bias -= learning_rate * grad_b_mu
        sd_Weight -= learning_rate * grad_W_logvar
        sd_Bias -= learning_rate * grad_b_logvar

        klbloss = -0.5 * np.sum(1 + sdNeuron - muNeuron**2 - np.exp(2 * sdNeuron)) / batch_size

        #loss = -0.5 * np.sum(1 + self.latent_logvar - self.latent_mu**2 - np.exp(self.latent_logvar)) / (self.batch_size * self.latent_dim)
        rec_loss = MSELoss(output, inputs)
        totalLoss = rec_loss + klbloss
        #print('---------------', index, '---------------')
        print(totalLoss)

    

In [313]:
lr = 0.01
momentum = [0.0] * 10
velocity = [0.0] * 10
beta1 = 0.9
beta2 = 0.9999
#t = 0

def optimise(grads):
    t = 0
    t += 1
        # Calculate gradient with momentum and velocity
    for i, grad in enumerate(grads):
        #print(i)
        momentum[i] = beta1 * momentum[i] + (1 - beta1) * grad
        velocity[i] = beta2 * velocity[i] + (1 - beta2) * np.power(grad, 2)
        m_h = momentum[i] / (1 - (beta1 ** t))
        v_h = velocity[i] /  (1 - (beta2 ** t))
        grads[i] = m_h / np.sqrt(v_h + eps)

        #print('---------encoder------------',grad_W0.shape, grad_b0.shape, grad_W_mu.shape, grad_b_mu.shape, grad_W_logvar.shape, grad_b_logvar.shape)
        W3, b3, W2, b2, W1, b1, mu_Weight, mu_Bias, sd_Weight, sd_Bias = grads

        # Update weights and biases
        W3 -= learning_rate *  np.sum(d_W3, axis=0)
        b3 -= learning_rate * np.sum(d_b3, axis=0)
        W2 -= learning_rate * np.sum(d_W2, axis=0)
        b2 -= learning_rate * np.sum(d_b2, axis=0)
        W1 -= learning_rate * np.sum(grad_W1, axis=0)
        b1 -= learning_rate * np.sum(grad_b1, axis=0)
        mu_Weight -= learning_rate * np.sum(grad_W_mu, axis= 0)
        mu_Bias -= learning_rate * np.sum(grad_b_mu, axis = 0)
        sd_Weight -= learning_rate * np.sum(grad_W_logvar, axis = 0)
        sd_Bias -= learning_rate * np.sum(grad_b_logvar, axis = 0)

        return


In [315]:
import numpy as np

for epoch in range(epochs):
    index = 0
    for labels, inputs in get_training_samples(batch_size):
        index += 1

        inputs = np.array(inputs)
        targets = inputs.copy()

        hValue = np.dot(inputs, W1) + b1
        hNeuron = relu(hValue)

        #mean and standard deviation
        muNeuron = (np.dot(hNeuron, mu_Weight) + mu_Bias)
        sdNeuron = (np.dot(hNeuron, sd_Weight) + sd_Bias)
        #print(hNeuron.shape, sdNeuron.shape, muNeuron.shape)      
        randomSample =  np.random.standard_normal(size=(batch_size, muNeuron.shape[1]))
        zSpace = muNeuron + np.exp(sdNeuron * 0.5) * randomSample
        #print(zSpace.shape)
        
        # Decoder
        hdValue = np.dot(zSpace, W2) + b2
        hdNeuron = relu(hdValue)
        oValue = np.dot(hdNeuron, W3) + b3
        output = sigmoid(oValue)

        #output = np.reshape(hdNeuron, (batch_size, 131))
        
        # Decoder Backpropagation
        d_error = MSELoss(output, inputs, derivative=True)
        d_pred = sigmoid(oValue, derivative=True)
        d_output = d_error * d_pred
        
        #d_b3 = d_output
        d_b3 = np.sum(d_output, axis=0)
        #print(d_b3.shape)
        d_W3 = np.dot(hdNeuron.T, d_output)

        d_b3 = d_output
        d_W3 = np.matmul(np.expand_dims(hdNeuron, axis=-1), np.expand_dims(d_output, axis=1))

        drelu0 = relu(hdValue, derivative=True)
        d_b2 = d_b3.dot(W3.T) * drelu0
        d_W2 = np.matmul(np.expand_dims(zSpace, axis=-1), np.expand_dims(d_b2, axis=1))

        d_zSpace = d_b2.dot(W2.T)

        # Encoder Backpropagation 
        d_muBias = d_zSpace
        #d_muBias = np.sum(d_muNeuron, axis = 0)
        d_muWeight = np.matmul(np.expand_dims(hNeuron, axis=-1), np.expand_dims(d_muBias, axis=1))        

        d_sdBias = d_zSpace * np.exp(sdNeuron * .5) * .5 * randomSample
        d_sdWeight = np.matmul(np.expand_dims(hNeuron, axis=-1), np.expand_dims(d_sdBias, axis=1))

        drelu = relu(hValue, derivative=True)
        db1 = drelu * (d_muBias.dot(mu_Weight.T) + d_sdBias.dot(sd_Weight.T))
        dW1 = np.matmul(np.expand_dims(inputs, axis=-1), np.expand_dims(db1, axis=1))        

        dkl_muBias = .5 * 2 * muNeuron
        dKL_W_mu = np.matmul(np.expand_dims(hNeuron, axis=-1), np.expand_dims(dkl_muBias, axis=1))

        dkl_sdBias = .5 * (np.exp(sdNeuron) - 1)
        dkl_sdWeight = np.matmul(np.expand_dims(hNeuron, axis=-1), np.expand_dims(dkl_sdBias, axis=1))

        dkl_b1 = drelu * (dkl_sdBias.dot(sd_Weight.T) + dkl_muBias.dot(mu_Weight.T))
        dkl_W1 = np.matmul(np.expand_dims(inputs, axis=-1), np.expand_dims(dkl_b1, axis=1))
        
        grad_b_logvar = dkl_sdBias + d_sdBias
        grad_W_logvar = dkl_sdWeight + d_sdWeight
        grad_b_mu = dkl_muBias + d_muBias
        grad_W_mu = dkl_muWeight + d_muWeight
        grad_b1 = dkl_b1 + db1
        grad_W1 = dkl_W1 + dW1     

        grads = [W3, b3, W2, b2, W1, b1, mu_Weight, mu_Bias, sd_Weight, sd_Bias]
        optimise(grads)        

        klbloss = -0.5 * np.sum(1 + sdNeuron - muNeuron**2 - np.exp(2 * sdNeuron)) / batch_size

        #loss = -0.5 * np.sum(1 + self.latent_logvar - self.latent_mu**2 - np.exp(self.latent_logvar)) / (self.batch_size * self.latent_dim)
        rec_loss = MSELoss(output, inputs)
        totalLoss = rec_loss + klbloss
        #print('---------------', index, '---------------')
        print(totalLoss)

    

0.10501941420016997


  res = 1 / (1 + np.exp(-clipped_value))


14272593.363857418


  W1 -= learning_rate * np.sum(grad_W1, axis=0)
  b1 -= learning_rate * np.sum(grad_b1, axis=0)
  sd_Weight -= learning_rate * np.sum(grad_W_logvar, axis = 0)
  sd_Bias -= learning_rate * np.sum(grad_b_logvar, axis = 0)


4.637032573737365e+132
nan
nan
nan
nan


KeyboardInterrupt: 

In [167]:
import numpy as np

for epoch in range(epochs):
    index = 0
    for labels, inputs in get_training_samples(batch_size):
        index += 1
        #print(index)
        inputs = np.array(inputs)
        #print(inputs.shape)

        targets = inputs.copy()

        hNeuron = sigmoid(np.dot(inputs, W1) + b1)

        #print(mu_Weight.shape)
        #muNeuron = sigmoid(np.dot(hNeuron, mu_Weight) + mu_Bias)
        #sdNeuron = sigmoid(np.dot(hNeuron, sd_Weight) + sd_Bias)

        muNeuron = (np.dot(hNeuron, mu_Weight) + mu_Bias)
        sdNeuron = (np.dot(hNeuron, sd_Weight) + sd_Bias)
        
        #print(muNeuron.shape, batch_size)     
        
        randomSample = np.random.standard_normal(size=(batch_size, muNeuron.shape[1]))
        zSpace = muNeuron + np.exp(sdNeuron * 0.5) * randomSample

        # Decoder
        hdNeuron = sigmoid(np.dot(zSpace, W2) + b2)
        output = sigmoid(np.dot(hdNeuron, W3) + b3)

        # Decoder Backpropagation
        error = dloss = 2 * (output - inputs)outputSig = dL * dSig
        #print(error)
        d_output = error * output * (1 - output)
        #d_output = error * (output > 0)
        d_b3 = np.sum(d_output, axis=0)
        d_W3 = np.dot(hdNeuron.T, d_output)
        
        d_hdNeuron = np.dot(d_output, W3.T) * hdNeuron * (1 - hdNeuron)
        #d_hdNeuron = np.dot(d_output, W3.T) * (hdNeuron > 0 )
        d_b2 = np.sum(d_hdNeuron, axis=0)
        d_W2 = np.dot(zSpace.T, d_hdNeuron)
        
        d_zSpace = np.dot(d_hdNeuron, W2.T)


        # Encoder Backpropagation 
        #print(d_zSpace.shape)        
        d_muNeuron = d_zSpace
        d_muBias = np.sum(d_muNeuron, axis = 0)
        d_muWeight = np.dot(hNeuron.T, d_muNeuron) 

        d_sdNeuron = d_zSpace * np.exp(sdNeuron * .5) * .5 * randomSample
        d_sdBias = np.sum(d_sdNeuron, axis = 0)
        d_sdWeight = np.dot(hNeuron.T, d_sdNeuron)

        hNeuronDerivative = hNeuron * (1 - hNeuron)
        #hNeuronDerivative = (hNeuron > 0 )
        dhNeuron = hNeuronDerivative * (np.dot(d_muBias, mu_Weight.T) + np.dot(d_sdNeuron, sd_Weight.T))  
        dW1 = np.dot(inputs.T, dhNeuron)
        db1 = np.sum(dhNeuron, axis = 0)

        dk1_muNeuron = .5 * 2 * muNeuron
        dkl_muBias = np.sum(dk1_muNeuron, axis = 0)
        dkl_muWeight = np.dot(hNeuron.T, dk1_muNeuron) 

        dk1_sdNeuron = .5 * (np.exp(sdNeuron) - 1)
        dkl_sdBias = np.sum(dk1_sdNeuron, axis = 0)
        dkl_sdWeight = np.dot(hNeuron.T, dk1_sdNeuron)
        
        hNeuronDerivative = hNeuron * (1 - hNeuron)
        #hNeuronDerivative = (hNeuron > 0 )
        dkl_hNeuron = hNeuronDerivative * (np.dot(dk1_muNeuron, mu_Weight.T) + np.dot(dkl_sdBias, sd_Weight.T))
        dkl_W1 = np.dot(inputs.T, dkl_hNeuron)
        dkl_b1 = np.sum(dkl_hNeuron, axis = 0)

        grad_b_logvar = dkl_sdBias + d_sdBias
        grad_W_logvar = dkl_sdWeight + d_sdWeight
        grad_b_mu = dkl_muBias + d_muBias
        grad_W_mu = dkl_muWeight + d_muWeight
        grad_b1 = dkl_b1 + db1
        grad_W1 = dkl_W1 + dW1        

        d_W3 = clip_gradient(d_W3)
        d_b3 = clip_gradient(d_b3)
        d_W2 = clip_gradient(d_W2)
        d_b2 = clip_gradient(d_b2)
        grad_W1 = clip_gradient(grad_W1)
        grad_b1 = clip_gradient(grad_b1)
        grad_W_mu = clip_gradient(grad_W_mu)
        grad_b_mu = clip_gradient(grad_b_mu)
        grad_W_logvar = clip_gradient(grad_W_logvar)
        grad_b_logvar = clip_gradient(grad_b_logvar)

        grads = [grad_W1, grad_b1, grad_W_mu, grad_b_mu, grad_W_logvar, grad_b_logvar, d_W2, d_b2, d_W3, d_b3]
        optimizer = AdamOptimizer(parameters=params, learning_rate=0.001)
        optimizer.update(params, grads)
        
        # Update weights and biases
        W3 -= learning_rate * d_W3
        b3 -= learning_rate * d_b3
        W2 -= learning_rate * d_W2
        b2 -= learning_rate * d_b2
        W1 -= learning_rate * grad_W1
        b1 -= learning_rate * grad_b1
        mu_Weight -= learning_rate * grad_W_mu
        mu_Bias -= learning_rate * grad_b_mu
        sd_Weight -= learning_rate * grad_W_logvar
        sd_Bias -= learning_rate * grad_b_logvar

        #klbloss = -0.5 * np.sum(1 + sdNeuron - muNeuron**2 - np.exp(sdNeuron)) / (batch_size * muNeuron.shape[1])

        klbloss = -0.5 * np.sum(1 + 2 * sdNeuron - muNeuron**2 - np.exp(2 * sdNeuron)) / batch_size


        #loss = -0.5 * np.sum(1 + self.latent_logvar - self.latent_mu**2 - np.exp(self.latent_logvar)) / (self.batch_size * self.latent_dim)
        loss = np.mean([log_loss(a, t) for a, t in zip(output, targets)])
        totalLoss = loss + klbloss
        print('---------------', index, '---------------')
        #print(loss)
        #print(klbloss)
        print(totalLoss)

    

SyntaxError: invalid syntax (3442248419.py, line 32)