In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [None]:
import tensorflow_probability as tfp

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [None]:
from scipy import random

In [None]:
import os
import time

In [None]:
@tf.function
def squash(x, axis=-1):
    """Capsule "squash" non-linearity applied to the vectors lying along ``axis``.

    Every vector is multiplied by ``norm / (1 + norm**2)``, where ``norm**2``
    is its sum of squares along ``axis`` plus ``keras.backend.epsilon()``.

    Args:
        x: Tensor holding the vectors to squash.
        axis: Axis over which the squared norm is reduced (default: last).

    Returns:
        Tensor with the same shape as ``x``.
    """
    sq_norm = tf.math.reduce_sum(tf.math.square(x), axis, keepdims=True)
    # The epsilon keeps the square root away from an exact zero.
    sq_norm = sq_norm + keras.backend.epsilon()
    factor = tf.math.sqrt(sq_norm) / (1 + sq_norm)
    return factor * x

@tf.function
def margin_loss(y_true, y_pred):
    """Margin loss summed over the last axis.

    For entries where ``y_true`` is 1 the loss is ``relu(0.9 - y_pred)**2``;
    for entries where it is 0 the loss is ``0.5 * relu(y_pred - 0.1)**2``.

    Args:
        y_true: Target tensor, used as a 0/1 mask over the last axis.
        y_pred: Predicted scores, same shape as ``y_true``.

    Returns:
        Tensor with the last axis reduced away (one loss value per row).
    """
    down_weight = 0.5
    margin = 0.1
    # Penalise a target entry whose score is below 1 - margin.
    present_term = y_true * tf.math.square(tf.nn.relu(1 - margin - y_pred))
    # Penalise (down-weighted) a non-target entry whose score is above margin.
    absent_term = down_weight * (1 - y_true) * tf.math.square(tf.nn.relu(y_pred - margin))
    return tf.math.reduce_sum(present_term + absent_term, axis=-1)

#@tf.function
def safe_norm(s, axis=-1, epsilon=1e-7, keep_dims=False):
    """Euclidean norm along ``axis`` with ``epsilon`` added under the square root.

    Args:
        s: Input tensor.
        axis: Axis to reduce over.
        epsilon: Constant added to the sum of squares before the square root.
        keep_dims: Whether the reduced axis is kept with length 1.

    Returns:
        ``sqrt(sum(s**2, axis) + epsilon)``.
    """
    sum_of_squares = tf.reduce_sum(tf.square(s), axis=axis, keepdims=keep_dims)
    return tf.sqrt(sum_of_squares + epsilon)

In [None]:
class Capsule(keras.layers.Layer):
    """Capsule layer whose coupling coefficients come from Gaussian-mixture EM.

    Each of the ``n`` input capsules is multiplied by a learned matrix to give
    a prediction vector for each of the ``num_capsule`` output capsules. The
    coefficients that weight those predictions are the responsibilities of a
    ``num_capsule``-component Gaussian mixture that is fitted, per sample and
    with ``routings`` EM iterations, to the *input* capsule vectors. Because
    the mixture lives in the input space, the input capsule dimension must
    equal ``dim_capsule``.

    The initial mixture state is allocated per sample in ``build``, so both the
    batch size and the number of input capsules must be statically known.
    All computation is done in float64.

    Args:
        num_capsule: Number of output capsules (mixture components).
        dim_capsule: Dimension of each output capsule.
        routings: Number of EM iterations; values <= 0 skip routing, which
            leaves the coupling coefficients at their all-zero initial value.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self,
                 num_capsule,
                 dim_capsule,
                 routings=3,
                 **kwargs):
        super().__init__(**kwargs)
        self.caps_n = num_capsule
        self.caps_dim = dim_capsule
        self.r = routings

    def get_config(self):
        """Return the layer config, keyed by the constructor argument names."""
        config = super().get_config().copy()
        config.update({
            'num_capsule': self.caps_n,
            'dim_capsule': self.caps_dim,
            'routings': self.r,
        })
        return config

    def build(self, input_shape):
        """Create the transformation weights and the initial mixture state.

        Args:
            input_shape: ``[batch_size, n_input_capsules, input_capsule_dim]``.

        Raises:
            ValueError: If the batch size or the number of input capsules is
                not statically known.
        """
        batch_size = input_shape[0]
        n = input_shape[1]
        if batch_size is None or n is None:
            raise ValueError(
                "Capsule needs a fixed batch size and a fixed number of input "
                "capsules, got input_shape={}".format(input_shape))
        k = self.caps_n
        d = self.caps_dim

        # One [d, d_in] matrix per (input capsule, output capsule) pair.
        self.W = self.add_weight(name='W',
                                 shape=[1, n, k, d, input_shape[-1]],
                                 dtype=tf.float64,
                                 initializer='glorot_uniform',
                                 trainable=True)

        # Initial means: uniform in [-10, 10), drawn once here from NumPy's
        # global RNG and reused as the starting point of every forward pass.
        self.mu = np.random.rand(batch_size, k, d) * 20 - 10
        # Initial covariances: identity for every component of every sample.
        self.sigma = tf.eye(d, batch_shape=[batch_size, k], dtype=tf.float64)
        # Initial mixing weights: uniform over the k components.
        self.pi = tf.ones([batch_size, k], dtype=tf.float64) / k
        # Initial coupling coefficients (only used as-is when routings <= 0).
        self.R = tf.zeros([batch_size, n, k], dtype=tf.float64)

    def call(self, input_tensor):
        """Compute the output capsules for a batch of input capsules.

        Args:
            input_tensor: ``[batch_size, n, caps_dim]`` tensor of input capsules.

        Returns:
            float64 tensor of shape ``[batch_size, num_capsule, dim_capsule]``.
        """
        # The mixture is fitted in the input space, whose dimension must match.
        assert input_tensor.shape[2] == self.caps_dim
        input_tensor = tf.cast(input_tensor, dtype=tf.float64)
        batch_size = input_tensor.shape[0]
        n = input_tensor.shape[1]
        k = self.caps_n

        # ---- prediction vectors -------------------------------------------
        # W: [1, n, k, d, d_in] -> [b, n, k, d, d_in]
        W_tiled = tf.tile(self.W, [batch_size, 1, 1, 1, 1])
        # input: [b, n, d_in] -> [b, n, 1, d_in, 1] -> [b, n, k, d_in, 1]
        caps_in = tf.expand_dims(tf.expand_dims(input_tensor, -1), 2)
        caps_in = tf.tile(caps_in, [1, 1, k, 1, 1])
        # Batched matrix-vector product: [b, n, k, d, 1]
        caps_predicted = tf.matmul(W_tiled, caps_in)

        # ---- EM routing ---------------------------------------------------
        # Plain tensors are enough here: every quantity is rebound inside the
        # loop, so nothing needs to be a tf.Variable.
        pi = tf.convert_to_tensor(self.pi, dtype=tf.float64)        # [b, k]
        mu = tf.convert_to_tensor(self.mu, dtype=tf.float64)        # [b, k, d]
        sigma = tf.convert_to_tensor(self.sigma, dtype=tf.float64)  # [b, k, d, d]
        R = tf.convert_to_tensor(self.R, dtype=tf.float64)          # [b, n, k]

        # The data does not change between iterations, so tile it once.
        x_tiled = tf.tile(tf.expand_dims(input_tensor, axis=1), [1, k, 1, 1])  # [b, k, n, d]

        for _ in range(self.r):
            # E-step: responsibility of component k for input capsule n.
            # Factor each covariance once, then repeat the factor for the n
            # points (rather than factoring n identical copies).
            scale_tril = tf.linalg.cholesky(sigma)                                      # [b, k, d, d]
            mu_tiled = tf.tile(tf.expand_dims(mu, axis=2), [1, 1, n, 1])                # [b, k, n, d]
            tril_tiled = tf.tile(tf.expand_dims(scale_tril, axis=2), [1, 1, n, 1, 1])   # [b, k, n, d, d]
            density = tfp.distributions.MultivariateNormalTriL(
                loc=mu_tiled, scale_tril=tril_tiled).prob(x_tiled)                      # [b, k, n]
            resp = pi[:, :, None] * density
            resp = resp / tf.reduce_sum(resp, axis=1, keepdims=True)                    # normalise over k
            R = tf.transpose(resp, perm=[0, 2, 1])                                      # [b, n, k]

            # M-step.
            N_k = tf.reduce_sum(R, axis=1)                                              # [b, k]
            pi = N_k / n
            mu = tf.matmul(resp, input_tensor) / N_k[:, :, None]                        # [b, k, d]

            # sigma_k = sum_n resp[k, n] * (x_n - mu_k)(x_n - mu_k)^T / N_k,
            # written as one matmul so no [b, k, n, d, d] tensor is built.
            centered = x_tiled - mu[:, :, None, :]                                      # [b, k, n, d]
            weighted = resp[:, :, :, None] * centered
            sigma = tf.matmul(weighted, centered, transpose_a=True) / N_k[:, :, None, None]

        # ---- output capsules ------------------------------------------------
        weighted_prediction = caps_predicted * R[:, :, :, None, None]             # [b, n, k, d, 1]
        weighted_sum = tf.reduce_sum(weighted_prediction, axis=1, keepdims=True)  # [b, 1, k, d, 1]
        v = squash(weighted_sum, axis=-2)
        return tf.squeeze(v, axis=[1, 4])                                         # [b, k, d]

    def compute_output_signature(self, input_shape):
        """Return the float64 ``TensorSpec`` of the layer output.

        Args:
            input_shape: Either a shape (list/tuple/``TensorShape``) or an
                object exposing one through a ``.shape`` attribute, such as a
                ``tf.TensorSpec``.
        """
        shape = getattr(input_shape, 'shape', input_shape)
        return tf.TensorSpec(shape=[shape[0], self.caps_n, self.caps_dim], dtype=tf.float64)

In [None]:
# Layers of the network: two conv + batch-norm stages and the capsule layer.
c1 = tf.keras.layers.Conv2D(
    filters=16, kernel_size=5, strides=1, padding='valid', activation='relu')
c2 = tf.keras.layers.Conv2D(
    filters=32, kernel_size=9, strides=1, padding='valid', activation='relu')
bn1 = tf.keras.layers.BatchNormalization()
bn2 = tf.keras.layers.BatchNormalization()
# 10 output capsules of dimension 16.
last = Capsule(num_capsule=10, dim_capsule=16)

In [None]:
# Wire the layers with the functional API. The batch size is fixed at 10.
model_input = keras.Input(shape=(28, 28, 1), batch_size=10)
# Both batch-norm layers are called with training=True unconditionally.
x = bn1(c1(model_input), training=True)
x = bn2(c2(x), training=True)
# Regroup the conv features into 16*32 = 512 input capsules of dimension 16.
x = last(tf.reshape(x, [-1, 16*32, 16]))
# Cast the capsule output to float32, then take each capsule's length.
x = safe_norm(tf.cast(x, tf.float32), axis=2)
model_output = x

In [None]:
# Bundle the graph built above into a Keras model.
model = keras.Model(inputs=model_input, outputs=model_output, name="encoder")

In [None]:
# Compile with the margin loss and print the layer summary.
adam = tf.keras.optimizers.Adam(learning_rate=0.0001)

model.compile(
    optimizer=adam,
    loss=margin_loss,
    metrics=tf.keras.metrics.CategoricalAccuracy(),
)
model.summary()

Model: "encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(10, 28, 28, 1)]         0         
                                                                 
 conv2d_4 (Conv2D)           (10, 24, 24, 16)          416       
                                                                 
 batch_normalization_4 (Batc  (10, 24, 24, 16)         64        
 hNormalization)                                                 
                                                                 
 conv2d_5 (Conv2D)           (10, 16, 16, 32)          41504     
                                                                 
 batch_normalization_5 (Batc  (10, 16, 16, 32)         128       
 hNormalization)                                                 
                                                                 
 tf.reshape_3 (TFOpLambda)   (10, 512, 16)             0   

In [None]:
# Load MNIST and convert it to the format the model expects.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide pixel values by 255, append a channels axis, and store as float32.
x_train = (x_train / 255.0)[..., tf.newaxis].astype("float32")
x_test = (x_test / 255.0)[..., tf.newaxis].astype("float32")

# One-hot encode the labels.
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

In [None]:
BATCH_SIZE = 10
SHUFFLE_BUFFER_SIZE = 100

# The model input is declared with a fixed batch size and the Capsule layer
# allocates its routing state for exactly that many samples, so every batch
# must be full. drop_remainder=True guarantees that (and gives the dataset a
# static batch dimension) even if the sample count is not a multiple of
# BATCH_SIZE.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
validation_dataset = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
"""customize training loop."""

# Optimizer used by the hand-written training loop.
base_learning_rate = 0.0001
optimizer = tf.keras.optimizers.Adam(learning_rate=base_learning_rate)

# Loss: the margin loss defined earlier in the notebook.
loss_fn = margin_loss

# Separate accuracy trackers for the training and validation passes.
train_acc_metric = tf.keras.metrics.CategoricalAccuracy()
val_acc_metric = tf.keras.metrics.CategoricalAccuracy()

In [None]:
epochs = 30
for epoch in range(epochs):
    print("\nepoch {}/{}".format(epoch+1,epochs))
    # Metric.result() is already a running average over the epoch. Listing the
    # names as stateful stops Progbar from averaging those averages again.
    pbar = keras.utils.Progbar(
        target=int(train_dataset.cardinality()),
        stateful_metrics=['train_acc', 'val_acc'],
    )
    metrics = {}
    steps_done = 0  # stays defined even if the training dataset is empty

    # Training pass over all batches.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        y_true = y_batch_train
        with tf.GradientTape() as tape:
            # NOTE(review): the BatchNormalization layers were wired with
            # training=True when the model was built, so this flag does not
            # change their behaviour; it states the mode at the call site.
            y_pred = model(x_batch_train, training=True)
            # Per-sample loss vector; tape.gradient differentiates its sum.
            loss_value = loss_fn(y_true, y_pred)
        grads = tape.gradient(loss_value, model.trainable_weights)  # back prop
        optimizer.apply_gradients(zip(grads, model.trainable_weights))  # weight update

        # Update the running training accuracy and the progress bar.
        train_acc_metric.update_state(y_true, y_pred)
        metrics['train_acc'] = float(train_acc_metric.result())
        steps_done = step + 1
        pbar.update(steps_done, values=metrics.items(), finalize=False)

    # Validation pass at the end of each epoch.
    for x_batch_val, y_batch_val in validation_dataset:
        val_pred = model(x_batch_val, training=False)
        val_acc_metric.update_state(y_batch_val, val_pred)

    metrics['val_acc'] = float(val_acc_metric.result())
    pbar.update(steps_done, values=metrics.items(), finalize=True)

    # Start the next epoch with fresh metric state.
    train_acc_metric.reset_state()
    val_acc_metric.reset_state()


epoch 1/30
  62/6000 [..............................] - ETA: 56:05 - train_acc: 0.1163

KeyboardInterrupt: ignored