In [1]:
import sys
sys.path.append("..")

In [2]:
# import libraries
from layers import mlp, Patches, PatchEncoder
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2022-04-15 14:37:20.173729: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [17]:
num_classes = 100
input_shape = (32, 32, 3)

learning_rate = 0.001
weight_decay = 0.0001
batch_size = 256
num_epochs = 100
image_size = 72  # We'll resize input images to this size
patch_size = 6  # Size of the patches to be extract from the input images
num_patches = (image_size // patch_size) ** 2
projection_dim = 64
num_heads = 4
transformer_units = [
    projection_dim * 2,
    projection_dim,
]  # Size of the transformer layers
transformer_layers = 8
mlp_head_units = [2048, 1024]

In [18]:
# create architecture
def tag_cnn_architecture():
    inputs = layers.Input(shape=input_shape)
    # Create patches.
    patches = Patches(patch_size)(inputs)
    # Encode patches.
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, encoded_patches])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        # Skip connection 2.
        encoded_patches = layers.Add()([x3, x2])
    
    
    # get patch attention scores
    x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    _, attention_weights = layers.MultiHeadAttention(num_heads=1, key_dim=projection_dim, 
                                                     dropout=0.1)(x1, x1, return_attention_scores=True)
    
    print(attention_weights.shape)
    
    
    # Create a [batch_size, projection_dim] tensor.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)
    # Add MLP.
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
    # Classify outputs.
    logits = layers.Dense(num_classes)(features)
    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=logits)
    return model

In [19]:
model = tag_cnn_architecture()

(None, 1, 144, 144)


In [9]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 32, 32, 3)]  0                                            
__________________________________________________________________________________________________
patches_1 (Patches)             (None, None, 108)    0           input_2[0][0]                    
__________________________________________________________________________________________________
patch_encoder_1 (PatchEncoder)  (None, 144, 64)      16192       patches_1[0][0]                  
__________________________________________________________________________________________________
layer_normalization_17 (LayerNo (None, 144, 64)      128         patch_encoder_1[0][0]            
______________________________________________________________________________________________