In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
from keras.engine import data_adapter

In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [4]:
import os

In [5]:
@tf.function
def squash(v,epsilon=1e-7,axis=-1):
    sqnrm=tf.reduce_sum(tf.square(v), axis=axis,keepdims=True)
    nrm=tf.sqrt(sqnrm + epsilon) #safe norm to avoid divide by zero.
    sqsh_factor = sqnrm / (1. + sqnrm)
    unit_vect = v / nrm
    return sqsh_factor*unit_vect

@tf.function
def safe_norm(s, axis=-1, epsilon=1e-7, keep_dims=False):
        squared_norm = tf.reduce_sum(tf.square(s),axis=axis,keepdims=keep_dims)
        return tf.sqrt(squared_norm + epsilon)

In [6]:

#downloading data.
_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip'
path_to_zip = tf.keras.utils.get_file('cats_and_dogs.zip', origin=_URL, extract=True)
PATH = os.path.join(os.path.dirname(path_to_zip), 'cats_and_dogs_filtered')

train_dir = os.path.join(PATH, 'train')
validation_dir = os.path.join(PATH, 'validation')

BATCH_SIZE = 32
IMG_SIZE = (160, 160)

#train data
train_dataset = tf.keras.utils.image_dataset_from_directory(train_dir,
                                                            shuffle=True,
                                                            batch_size=BATCH_SIZE,
                                                            image_size=IMG_SIZE)

#validation model.
validation_dataset = tf.keras.utils.image_dataset_from_directory(validation_dir,
                                                                 shuffle=True,
                                                                 batch_size=BATCH_SIZE,
                                                                 image_size=IMG_SIZE)

""" #viewing some sample from the dataset.
class_names = train_dataset.class_names
plt.figure(figsize=(10, 10))
for images, labels in train_dataset.take(1):
  for i in range(3):
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(images[i].numpy().astype("uint8"))
    plt.title(class_names[labels[i]])
    plt.axis("off")
    """

# creating test data.
val_batches = tf.data.experimental.cardinality(validation_dataset)
test_dataset = validation_dataset.take(val_batches // 5)
validation_dataset = validation_dataset.skip(val_batches // 5)

print('Number of validation batches: %d' % tf.data.experimental.cardinality(validation_dataset))
print('Number of test batches: %d' % tf.data.experimental.cardinality(test_dataset))


#y_train=tf.keras.utils.to_categorical(y_train)
#y_test=tf.keras.utils.to_categorical(y_test)

Downloading data from https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
Found 2000 files belonging to 2 classes.
Found 1000 files belonging to 2 classes.
Number of validation batches: 26
Number of test batches: 6


In [7]:

#optimization parameter setting.
AUTOTUNE = tf.data.AUTOTUNE

train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE)
validation_dataset = validation_dataset.prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.prefetch(buffer_size=AUTOTUNE)


In [None]:
caps1_n_maps = 32
caps1_n_caps = caps1_n_maps * 6 * 6  # 1152 primary capsules
caps1_n_dims = 8

# digit capsule layer
caps2_n_caps = 10 # 10 capsule each digit.
caps2_n_dims = 16 # each of the 10 capsules are of 16 dims.


In [8]:
class Primary_caps_layer(tf.keras.layers.Layer):
  """ caps_n(i) --> no of capsule in ith layer 
      caps_dim(i) --> dimension of capsule in ith layer. 
      
      primary_caps_layer output shape = [batch_size,caps_n,caps_dim]"""

  def __init__(self,caps_n=1152,k1=256,k2=256,k_s1=9,k_s2=5,s1=1,s2=3):
    super(Primary_caps_layer, self).__init__()
    self.caps_n=caps_n  # no of capsule in this layer.
    #self.caps_dim=caps_dim # dim of each capsule in this layer
    self.k1=k1
    self.k2=k2
    self.k_s1=k_s1
    self.k_s2=k_s2
    self.s1=s1
    self.s2=s2
    self.conv1=tf.keras.layers.Conv2D(k1,kernel_size=k_s1,strides=s1,padding='valid',activation='relu') 
    self.conv2=tf.keras.layers.Conv2D(k2,kernel_size=k_s2,strides=s2,padding='valid',activation='relu')

  def call(self, input_tensor):
    x=self.conv1(input_tensor)
    x=self.conv2(x)
    assert x.shape[1]*x.shape[1]*self.k2==self.caps_n*self.caps_dim
    x=tf.reshape(x,[self.batch_size,self.caps_n,self.caps_dim])
    return squash(x)

  def build(self,input_shape):
    self.batch_size=input_shape[0]
    tmp=int(((input_shape[1]-self.k_s1)/self.s1))+1
    self.conv1_output_shape=[input_shape[0],tmp,tmp,self.k1]
    tmp=int(((tmp-self.k_s2)/self.s2))+1
    self.conv2_output_shape=[input_shape[0],tmp,tmp,self.k2]
    tmp1=tmp*tmp*self.k2
    self.caps_n=self.caps_n-(tmp1%self.caps_n)
    self.caps_dim=int((tmp*tmp*self.k2)/self.caps_n);
    
    

In [9]:
class Digit_caps_layer(tf.keras.layers.Layer):
  """ caps_n(i) --> no of capsule in ith layer 
      caps_dim(i) --> dimension of capsule in ith layer. 
      and we assume this is ith layer. 
      output.shape of ith layer = [batch_size, 1,caps_n(i),caps_dim(i), 1]"""

  def __init__(self,caps_dim=16,caps_n=10,r=3):
    super(Digit_caps_layer,self).__init__()
    self.caps_n=caps_n # no of capsule.
    self.caps_dim=caps_dim # dim of each capsule.
    self.r=r # no of iteration in routing by agreement algorithm.
    
  def build(self,input_shape): # input_shape = [batch_size,caps_n(i-1),caps_dim(i-1)] 
    self.W = tf.Variable(initial_value=tf.random.normal(
    shape=(1, input_shape[1], self.caps_n, self.caps_dim, input_shape[-1]),
    stddev=0.1, dtype=tf.float32),
    trainable=True)  #weigth initialization for this layer W.shape=[1,caps_n(i-1),caps_n(i),caps_dim(i),caps_dim(i-1)].

  def call(self,input_tensor): #input_tensor.shape=[batch_size,caps_n(i-1),caps_dim(i-1)]
    batch_size = input_tensor.shape[0]
    W_tiled = tf.tile(self.W, [batch_size, 1, 1, 1, 1]) # replicating the weights for parallel processing of a batch.
    """ W_tiled.shape=[batch_size,caps_n(i-1),caps_n(i),caps_dim(i),caps_dim(i-1)] """

    caps_output_expanded = tf.expand_dims(input_tensor, -1) # converting last dim to a column vector.
    """ the above step change the input shape from 
        [batch_size,caps_n(i-1),caps_dim(i-1)] --> [batch_size,caps_n(i-1),caps_dim(i-1),1]"""

    caps_output_tile = tf.expand_dims(caps_output_expanded, 2)
    """ the above step change the input shape from 
        [batch_size,caps_n(i-1),caps_dim(i-1),1] --> [batch_size,caps_n(i-1),1,caps_dim(i-1),1]"""

    caps_output_tiled = tf.tile(caps_output_tile, [1, 1, self.caps_n, 1, 1]) # replicating the input capsule vector for every output capsule.
    " i.e [batch_size,caps_n(i-1),1,caps_dim(i-1),1] --> [batch_size,caps_n(i-1),caps_n(i),1,caps_dim(i-1),1]"

    caps_predicted = tf.matmul(W_tiled, caps_output_tiled) # this is performing element wise tf.matmul() operation.
    """ caps_predicted.shape = [1,caps_n(i-1),caps_n(i),caps_dim(i),1]"""

    """ dynamic routing """
    raw_weights = tf.zeros([batch_size,input_tensor.shape[1] , self.caps_n, 1, 1]) # non trainable weights.
    """ raw_weights.shape=[batch_size,caps_n(i-1) ,caps_n(i), 1, 1]"""

    r=self.r
    while(r):
      r-=1
      routing_weights = tf.nn.softmax(raw_weights,axis=2)
      """ [batch_size,caps_n(i-1) ,caps_n(i), 1, 1]  softmax applied along the pointed dim.
                                       ^                                                   """

      weighted_predictions = tf.multiply(routing_weights, caps_predicted)
      """ weighted_predictions.shape = [batch_size, caps_n(i-1),caps_n(i),caps_dim(i), 1]"""

      weighted_sum = tf.reduce_sum(weighted_predictions, axis=1, keepdims=True)
      """ [batch_size,caps_n(i-1) ,caps_n(i),caps_dim(i), 1]  sum applied along the pointed dim.
                           ^                                                               
      therefore weighted_sum.shape=[batch_size,1 ,caps_n(i),caps_dim(i), 1]"""

      v = squash(weighted_sum, axis=-2) #normalize to unit length vector.
      v_tiled = tf.tile(v, [1, input_tensor.shape[1], 1, 1, 1])
      """ v_tiled.shape=[batch_size,caps_n(i-1),caps_n(i),caps_dim(i), 1]"""

      agreement = tf.matmul(caps_predicted, v_tiled,transpose_a=True)
      """ agreement.shape=[batch_size,caps_n(i-1),caps_n(i), 1, 1]"""

      if(r>0):
          routing_weights+=agreement
      else:
          return v

In [38]:
"""

class Caps_net(tf.keras.Model):

  def __init__(self,no_classes=10,batch_size=32):
    super(Caps_net,self).__init__()
    self.batch_size=batch_size
    self.pri_layer=Primary_caps_layer(caps_n=1152,k1=256,k2=256,k_s1=9,k_s2=5,s1=1,s2=3,batch_size=self.batch_size)
    #self.intrm_layer=Digit_caps_layer(caps_dim=8,caps_n=10,r=3)
    self.dig_layer=Digit_caps_layer(caps_dim=8,caps_n=no_classes,r=3,batch_size=self.batch_size)
    

  def call(self,input_tensor):
    x = self.pri_layer(input_tensor) #x.shape=[batch_size,caps_n(i),caps_dim(i)]
    #x = self.intrm_layer(x)
    #x = tf.squeeze(x, axis=[1,4])
    x = self.dig_layer(x) #x.shape=[batch_size, 1,caps_n(i),caps_dim(i), 1]

    """The lengths of the output vectors represent the class probabilities, 
       so we could just use tf.norm() to compute them,"""
    x = safe_norm(x, axis=-2) #x.shape=[batch_size,1,caps_n(i-1),1]

    x = tf.nn.softmax(x,axis=2) #converting those probabilities to prob dist.
    x = tf.squeeze(x, axis=[1,3]) #reducing the extra dims. therefore the output shape =[batch_size,caps_n(i-1)] 
    return x

  def build(self,input_shape):
    self.batch_size=input_shape[0];

  """ custom training loop """
  def train_step(self,data):
    x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data)
    y=tf.keras.utils.to_categorical(y)
    with tf.GradientTape() as tape:
        y_pred = self(x, training=True)  # Forward pass
        # Compute the loss value
        # (the loss function is configured in `compile()`)
        loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}  
"""


In [39]:
#model=Caps_net(no_classes=2)

In [40]:
"""
model.compile(
          loss      = tf.keras.losses.CategoricalCrossentropy(from_logits=True),
          metrics   = tf.keras.metrics.CategoricalAccuracy(),
          optimizer = tf.keras.optimizers.Adam())
"""

In [10]:
#output of the tmp model. 
image_batch, label_batch = next(iter(train_dataset))



In [11]:
image_batch.shape

TensorShape([32, 160, 160, 3])

In [20]:
pri=Primary_caps_layer(caps_n=256,k1=64,k2=64,k_s1=9,k_s2=5,s1=1,s2=3)
dig=Digit_caps_layer(caps_dim=8,caps_n=2,r=3)

In [21]:
x=pri(image_batch)

In [22]:
x.shape

TensorShape([32, 256, 625])

In [23]:
x=dig(x)

In [24]:
x.shape

TensorShape([32, 1, 2, 8, 1])

In [26]:
x[0]

<tf.Tensor: shape=(1, 2, 8, 1), dtype=float32, numpy=
array([[[[-0.38722607],
         [-0.10100831],
         [-0.33626333],
         [-0.00649191],
         [-0.04269022],
         [ 0.5096647 ],
         [ 0.03618015],
         [ 0.17151405]],

        [[ 0.23256193],
         [-0.11151747],
         [ 0.28388774],
         [-0.5194642 ],
         [ 0.06661782],
         [-0.22748002],
         [-0.33919474],
         [-0.07836808]]]], dtype=float32)>

In [27]:
x = safe_norm(x, axis=-2) #x.shape=[batch_size,1,caps_n(i-1),1]
print(x.shape)
x = tf.nn.softmax(x,axis=2) #converting those probabilities to prob dist.
print(x.shape)
x = tf.squeeze(x, axis=[1,3]) #reducing the extra dims. therefore the output shape =[batch_size,caps_n(i-1)] 
print(x.shape)

(32, 1, 2, 1)
(32, 1, 2, 1)
(32, 2)


In [34]:
x

<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
array([[0.49527785, 0.5047222 ],
       [0.48035017, 0.5196498 ],
       [0.49423286, 0.50576717],
       [0.49328157, 0.5067185 ],
       [0.5099131 , 0.49008685],
       [0.53202593, 0.4679741 ],
       [0.5188192 , 0.48118076],
       [0.48478884, 0.5152112 ],
       [0.5120314 , 0.48796862],
       [0.5013908 , 0.49860916],
       [0.50233954, 0.4976605 ],
       [0.51419985, 0.48580015],
       [0.49786547, 0.5021345 ],
       [0.4905074 , 0.50949264],
       [0.5292736 , 0.4707264 ],
       [0.50634205, 0.49365792],
       [0.4902343 , 0.5097657 ],
       [0.5024426 , 0.4975574 ],
       [0.4692447 , 0.53075534],
       [0.5054724 , 0.49452758],
       [0.4696343 , 0.53036565],
       [0.47749606, 0.5225039 ],
       [0.47772518, 0.52227485],
       [0.52168375, 0.47831622],
       [0.5056131 , 0.4943869 ],
       [0.516404  , 0.48359606],
       [0.48396993, 0.51603   ],
       [0.47038445, 0.5296156 ],
       [0.4950577 , 0.50494

In [14]:

class Caps_net(tf.keras.Model):

  def __init__(self,no_classes=10):
    super(Caps_net,self).__init__()
    self.pri_layer=Primary_caps_layer(caps_n=256,k1=64,k2=64,k_s1=9,k_s2=5,s1=1,s2=3)
    self.dig_layer=Digit_caps_layer(caps_dim=8,caps_n=no_classes,r=3)
    

  def call(self,input_tensor):
    x = self.pri_layer(input_tensor) #x.shape=[batch_size,caps_n(i),caps_dim(i)]
    x = self.dig_layer(x) #x.shape=[batch_size, 1,caps_n(i),caps_dim(i), 1]
    x = safe_norm(x, axis=-2) #x.shape=[batch_size,1,caps_n(i-1),1]
    x = tf.nn.softmax(x,axis=2) #converting those probabilities to prob dist.
    x = tf.squeeze(x, axis=[1,3]) #reducing the extra dims. therefore the output shape =[batch_size,caps_n(i-1)] 
    return x

  def train_step(self,data):
    x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data)
    y=tf.keras.utils.to_categorical(y)
    with tf.GradientTape() as tape:
        y_pred = self(x, training=True)  # Forward pass
        # Compute the loss value
        # (the loss function is configured in `compile()`)
        loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}  


In [15]:
model=Caps_net(no_classes=2)

In [16]:
model(image_batch)

<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
array([[0.47808638, 0.5219137 ],
       [0.5043702 , 0.49562973],
       [0.49857965, 0.50142026],
       [0.515404  , 0.484596  ],
       [0.48895067, 0.51104933],
       [0.5070465 , 0.49295345],
       [0.50374866, 0.49625137],
       [0.5033982 , 0.49660185],
       [0.4966536 , 0.50334644],
       [0.4947047 , 0.5052953 ],
       [0.48524812, 0.51475185],
       [0.4881652 , 0.5118348 ],
       [0.4909044 , 0.5090956 ],
       [0.50118744, 0.49881256],
       [0.5207015 , 0.47929847],
       [0.49168444, 0.50831556],
       [0.52116555, 0.47883442],
       [0.5291591 , 0.47084093],
       [0.5165412 , 0.4834588 ],
       [0.49379814, 0.5062019 ],
       [0.51503056, 0.48496944],
       [0.55607474, 0.44392523],
       [0.5044899 , 0.49551   ],
       [0.5177015 , 0.4822985 ],
       [0.5055316 , 0.4944684 ],
       [0.46362445, 0.53637546],
       [0.51964074, 0.48035932],
       [0.50169176, 0.49830827],
       [0.48984656, 0.51015

In [17]:
"""customize training loop."""

# Instantiate an optimizer to train the model.
base_learning_rate = 0.0001
optimizer = tf.keras.optimizers.Adam(learning_rate=base_learning_rate)
# Instantiate a loss function.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

# Prepare the metrics.
train_acc_metric = tf.keras.metrics.CategoricalAccuracy()
val_acc_metric = tf.keras.metrics.CategoricalAccuracy()

In [28]:
import time

epochs = 3
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        y_batch_train=tf.keras.utils.to_categorical(y_batch_train)
        if(x_batch_train.shape[0]==32): #@ instant hack (needed to be fixed.)
          with tf.GradientTape() as tape:
              logits = model(x_batch_train)
              loss_value = loss_fn(y_batch_train, logits)
          grads = tape.gradient(loss_value, model.trainable_weights)
          optimizer.apply_gradients(zip(grads, model.trainable_weights))

          # Update training metric.
          train_acc_metric.update_state(y_batch_train, logits)

          """# Log every 200 batches.
          if step % 200 == 0:
              print(
                  "Training loss (for one batch) at step %d: %.4f"
                  % (step, float(loss_value))
              )
              print("Seen so far: %d samples" % ((step + 1) * batch_size))"""

    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))

    # Reset training metrics at the end of each epoch
    train_acc_metric.reset_states()

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in validation_dataset:
      if(x_batch_val.shape[0]==32): #@ instant hack (needed to be fixed.)
        val_logits = model(x_batch_val)
        # Update val metrics
        val_acc_metric.update_state(y_batch_val, val_logits)
    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print("Validation acc: %.4f" % (float(val_acc),))
    print("Time taken: %.2fs" % (time.time() - start_time))


Start of epoch 0
Training acc over epoch: 0.5015
Validation acc: 0.2992
Time taken: 410.74s

Start of epoch 1
Training acc over epoch: 0.5318
Validation acc: 0.3762
Time taken: 362.87s

Start of epoch 2
Training acc over epoch: 0.5494
Validation acc: 0.6025
Time taken: 349.25s


In [21]:
step=10
batch_size=32
print("Seen so far: %d samples" % ((step + 1) * batch_size))

Seen so far: 352 samples
