# **Using CapsNet for Video Classification**

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# NOTE(review): later cells also use cv2, paths (imutils), train_test_split and
# ImageDataGenerator, none of which are imported here; add those imports to this
# cell so the notebook survives Restart Kernel -> Run All.

## **Load the datasets**

In [None]:
# Class labels to keep; images whose parent directory is not one of these are skipped
LABELS = {"Abuse", "Assault", "Fighting", "Normal", "Robbery", "Vandalism"}

# Root directory of the dataset (expected layout: <root>/.../<label>/<image file>)
# NOTE(review): absolute, machine-specific path; point this at your own copy of the data.
DATASET_DIR = r'C:\Users\Yash Umale\Documents\6th Sem\Open Lab\Python Files\Crime Detection\Datasets'

# Collect the path of every image under the dataset root
print("Loading images:")
imagePaths = list(paths.list_images(DATASET_DIR))

data = []
labels = []

# Loop over the image paths
for imagePath in imagePaths:
    # The label is the name of the directory that directly contains the image
    label = imagePath.split(os.path.sep)[-2]

    if label not in LABELS:
        continue

    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread signals an unreadable/corrupt file by returning None rather
        # than raising; skip it instead of crashing in cvtColor below
        print("Skipping unreadable image:", imagePath)
        continue

    # OpenCV loads images as BGR; convert to RGB and resize to the network input size
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (224, 224))

    data.append(image)
    labels.append(label)

# Rebind the names to the converted arrays (the original discarded the results,
# leaving `data` and `labels` as plain Python lists)
data = np.array(data)
labels = np.array(labels)

In [None]:
# Stratified 75% / 25% train/test split with a fixed seed for reproducibility
(trainX, testX, trainY, testY) = train_test_split(data, labels, test_size=0.25, stratify=labels, random_state=42)

# Initialize the training data augmentation object.
# featurewise_center=True is required for the generator to subtract the `.mean`
# assigned below; without it ImageDataGenerator ignores that attribute.
trainAug = ImageDataGenerator(featurewise_center=True,
                              rotation_range=30, 
                              zoom_range=0.15, 
                              width_shift_range=0.2, 
                              height_shift_range=0.2, 
                              shear_range=0.15, 
                              horizontal_flip=True, 
                              fill_mode="nearest")

# Initialize the validation/testing data augmentation object
# (no random augmentation, only the same mean subtraction as training)
valAug = ImageDataGenerator(featurewise_center=True)

# Define the ImageNet mean subtraction (in RGB order) and attach it to both
# generators so training and validation inputs are centred identically
mean = np.array([123.68, 116.779, 103.939], dtype="float32")
trainAug.mean = mean
valAug.mean = mean

# Number of training epochs
n_epochs = 50

In [None]:
# Graph input: a batch of 224 x 224 three-channel images; the batch size is left open (None)
X = tf.placeholder(
    tf.float32,
    shape=[None, 224, 224, 3],
    name="X",
)

### The first capsule layer is composed of 256 maps of 104 x 104 capsules each.
Each capsule outputs a 128-dimensional vector.

In [None]:
# Dimensions of the first capsule layer
caps1_n_maps = 256                                                                   # capsule maps
caps_n_maps = caps1_n_maps                                                           # original (misspelled) name, kept as an alias
caps1_n_caps = caps1_n_maps * 104 * 104                                              # 2768896 capsules
caps1_n_dims = 128                                                                   # vector length per capsule

In [None]:
# Two "valid" 9x9 convolutions that feed the first capsule layer.
# tf.layers.conv2d is the functional form that takes the input tensor as its
# first argument; tf.layers.Conv2D is the layer class, whose first positional
# parameter is `filters`, so passing X there clashes with the `filters=` keyword.
#
# NOTE(review): with 4096 input channels and caps1_n_maps * caps1_n_dims output
# channels, conv2 alone needs 9 * 9 * 4096 * 32768 (about 1.1e10) kernel weights;
# confirm these filter counts are intentional before trying to train this graph.

# 224 x 224 input -> 216 x 216 feature maps (kernel 9, stride 1, no padding)
conv1 = tf.layers.conv2d(X, name = "conv1", 
                        filters = 4096, 
                        kernel_size = 9,
                        strides = 1,
                        padding = "valid",
                        activation = tf.nn.relu)

# 216 x 216 -> 104 x 104 feature maps (kernel 9, stride 2, no padding);
# one output channel per (capsule map, capsule dimension) pair
conv2 = tf.layers.conv2d(conv1, name = "conv2",
                        filters = caps1_n_maps * caps1_n_dims, 
                        kernel_size = 9, 
                        strides = 2,
                        padding = "valid",
                        activation = tf.nn.relu)

With "valid" padding and a kernel size of 9, a stride-1 convolution shrinks each spatial dimension by (9 - 1 = 8) pixels.

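Hence, the first convolution (4096 filters, stride = 1) gives (224, 224, 3) -> (216, 216, 4096).\
The second convolution (32768 filters, stride = 2) gives floor((216 - 9) / 2) + 1 = 104 along each spatial dimension, i.e. (216, 216, 4096) -> (104, 104, 32768).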

### Output of the Conv2D layer:

Number of maps (256) * Vector dimensions per capsule (128) = 32768 feature maps in total.\
Each feature map is 104 x 104.

In [None]:
# Regroup conv2's activations into one vector per capsule:
# (batch, caps1_n_caps, caps1_n_dims)
caps1_raw = tf.reshape(
    conv2,
    shape=[-1, caps1_n_caps, caps1_n_dims],
    name="caps1_raw",
)

In [None]:
def squash(s, axis = -1, epsilon = 1e-7, name = None):
    """Apply the capsule "squash" non-linearity along `axis`.

    Each vector taken along `axis` keeps its direction, while its length is
    rescaled to ||s||^2 / (1 + ||s||^2), so the result always has a norm
    below 1.

    Args:
        s: Tensor of capsule vectors.
        axis: Axis along which the vectors lie (defaults to the last axis).
        epsilon: Small constant added under the square root so that a
            zero-length vector does not cause a division by zero.
        name: Optional name scope; "squash" is used when omitted.

    Returns:
        A tensor with the same shape as `s` holding the squashed vectors.
    """
    with tf.name_scope(name, default_name="squash"):
        # Squared length of every vector; the reduced axis is kept so the
        # result broadcasts against `s`
        norm_sq = tf.reduce_sum(tf.square(s), axis=axis, keep_dims=True)
        # epsilon keeps the norm strictly positive
        norm = tf.sqrt(norm_sq + epsilon)
        # Target length of the squashed vector
        scale = norm_sq / (1. + norm_sq)
        return scale * (s / norm)

In [None]:
# First capsule layer output: every raw capsule vector squashed along its last axis
caps1_output = squash(
    caps1_raw,
    name="caps1_output",
)