# Deep Computer Vision Using Convolutional Neural Networks

## Implementing Convolutional Layers with Keras

In [1]:
## Get some sample images
from sklearn.datasets import load_sample_images
import tensorflow as tf

images = load_sample_images()['images']
images = tf.keras.layers.CenterCrop(height=70, width=120)(images)
images = tf.keras.layers.Rescaling(scale=1 / 255)(images)

2025-01-12 19:48:45.347131: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-12 19:48:45.485408: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736732925.539547    3929 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736732925.555388    3929 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-12 19:48:45.697557: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
images.shape
## 4d tensor
## 2 Images in tensor
## 70 is the height in pixels
## 120 is the width in pixels
## 3 is the amount of color channels(most likly RGB)

TensorShape([2, 70, 120, 3])

In [3]:
conv_layer = tf.keras.layers.Conv2D(filters=32, kernel_size=7)
fmaps = conv_layer(images)

I0000 00:00:1736732928.335240    3929 cuda_dnn.cc:529] Loaded cuDNN version 90300


In [4]:
fmaps.shape
## 32 channels (because of the filters)
## The height and width shrunk because Conv2d doesn't zero pad by defualt
## 2 images

TensorShape([2, 64, 114, 32])

In [5]:
conv_layer = tf.keras.layers.Conv2D(filters=32, kernel_size=7,
                                   padding='same') ## Zero padding
fmaps = conv_layer(images)

In [6]:
fmaps.shape

TensorShape([2, 70, 120, 32])

In [7]:
kernels, biases = conv_layer.get_weights()
biases.shape
## 1d shape [output_channels]

(32,)

In [8]:
kernels.shape
## 4d shape [kernel_height, kernel_width, input_channels, output_channels]

(7, 7, 3, 32)

<b>You should also add activation functions and kernel_initilizers to Convolutional layers, for the same reasons as Dense layers

## Pooling Layers

Pooling layers are ment to subsample(i.e. shrink) the input image in order to reduce the computational load, the memory usage, and the number of parameters(thereby limiting the risk of overfitting)

### Pooling Layers in Keras

In [9]:
## Max Pool
max_pool = tf.keras.layers.MaxPool2D(pool_size=2)
## AveragePool
avg_pool = tf.keras.layers.AvgPool2D(pool_size=2)

In [10]:
### Depthwise pooling layer
class DepthPool(tf.keras.layers.Layer):
    def __init__(self, pool_size=2, **kwargs):
        super().__init__(**kwargs)
        self.pool_size = pool_size

    def call(self, inputs):
        shape = tf.shape(inputs)
        groups = shape[-1] // self.pool_size ## number of channel groups
        new_shape = tf.concat([shape[:-1], [groups, self.pool_size]], axis=0)
        return tf.reduce_max(tf.reshape(inputs, new_shape), axis=-1)

In [11]:
## Global average pooling
global_avg_pool = tf.keras.layers.GlobalAvgPool2D()

In [12]:
global_avg_pool(images)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.643388  , 0.59718215, 0.5825038 ],
       [0.7630747 , 0.26010972, 0.10848834]], dtype=float32)>

## CNN Architectures

In [13]:
# extra code – loads the mnist dataset, add the channels axis to the inputs,
#              scales the values to the 0-1 range, and splits the dataset
import numpy as np

mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = mnist
X_train_full = np.expand_dims(X_train_full, axis=-1).astype(np.float32) / 255
X_test = np.expand_dims(X_test.astype(np.float32), axis=-1) / 255
X_train, X_valid = X_train_full[:-5000], X_train_full[-5000:]
y_train, y_valid = y_train_full[:-5000], y_train_full[-5000:]

### Basic CNN Structure

In [14]:
## Used on Fashion MNIST
from functools import partial

DefaultConv2d = partial(tf.keras.layers.Conv2D, kernel_size=3, padding='same',
                       activation='relu', kernel_initializer='he_normal')
model = tf.keras.Sequential([
    ### This first conv layer should capture lower level features so less filters
    DefaultConv2d(filters=64, kernel_size=7, input_shape=[28,28,1]),
    tf.keras.layers.MaxPool2D(),
    DefaultConv2d(filters=128),
    DefaultConv2d(filters=128),
    tf.keras.layers.MaxPool2D(),
    ### This conv layers should capture the high level features so have more filters
    DefaultConv2d(filters=256),
    DefaultConv2d(filters=256),
    tf.keras.layers.MaxPool2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu', 
                          kernel_initializer='he_normal'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu', 
                          kernel_initializer='he_normal'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
             optimizer='nadam',
             metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_valid, y_valid))

Epoch 1/5


I0000 00:00:1736732930.538899    4011 service.cc:148] XLA service 0x721860012720 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1736732930.539032    4011 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2025-01-12 19:48:50.606743: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m  34/1719[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8s[0m 5ms/step - accuracy: 0.1678 - loss: 2.7955    

I0000 00:00:1736732933.144613    4011 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - accuracy: 0.6103 - loss: 1.0985 - val_accuracy: 0.8688 - val_loss: 0.3637
Epoch 2/5
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8516 - loss: 0.4461 - val_accuracy: 0.8922 - val_loss: 0.3063
Epoch 3/5
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8790 - loss: 0.3557 - val_accuracy: 0.8846 - val_loss: 0.3033
Epoch 4/5
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8904 - loss: 0.3158 - val_accuracy: 0.8988 - val_loss: 0.2685
Epoch 5/5
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9004 - loss: 0.2934 - val_accuracy: 0.9094 - val_loss: 0.2524


### List of other Architectures
- LeNet-5
- AlexNet
- GoogLeNet
- VGGNet
- ResNet
- Xception(A variant of GoogLeNet)
- SENet
- ResNeXt
- DenseNet
- MobileNet
- CSPNet
- EfficientNet

## Implementing a ResNet-34 CNN

In [16]:
DefaultConv2D = partial(tf.keras.layers.Conv2D, kernel_size=3, strides=1,
                       padding='same', kernel_initializer='he_normal',
                       use_bias=False)

class ResidualUnit(tf.keras.layers.Layer):
    def __init__(self, filters, strides=1, activation='relu', **kwargs):
        super().__init__(**kwargs)
        self.activation = tf.keras.activations.get(activation)
        self.main_layers = [
            DefaultConv2D(filters, strides=strides),
            tf.keras.layers.BatchNormalization(),
            self.activation,
            DefaultConv2D(filters),
            tf.keras.layers.BatchNormalization()
        ]
        self.skip_layers = []
        if strides > 1:
            self.skip_layers = [
                DefaultConv2D(filters, kernel_size=1, strides=strides),
                tf.keras.layers.BatchNormalization()
            ]

    def call(self, inputs):
        Z = inputs 
        for layer in self.main_layers:
            Z = layer(Z)
        skip_Z = inputs
        for layer in self.skip_layers:
            skip_Z = layer(skip_Z)
        return self.activation(Z + skip_Z)

In [17]:
res_net_34 = tf.keras.Sequential([
    DefaultConv2D(64, kernel_size=7, strides=2, input_shape=[224, 224, 3]),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same')
])
prev_filters = 64
for filters in [64] * 3 + [128] * 4 + [256] * 6 + [512] * 3:
    strides = 1 if filters == prev_filters else 2
    res_net_34.add(ResidualUnit(filters, strides=strides))
    prev_filters = filters
    
res_net_34.add(tf.keras.layers.GlobalAvgPool2D())
res_net_34.add(tf.keras.layers.Flatten())
res_net_34.add(tf.keras.layers.Dense(10, activation='softmax'))

## Using Pretrained Models from Keras

In [18]:
model = tf.keras.applications.ResNet50(weights='imagenet')

In [19]:
images = load_sample_images()['images']
images_resized = tf.keras.layers.Resizing(height=224, width=224, crop_to_aspect_ratio=True)(images)

In [20]:
inputs = tf.keras.applications.resnet50.preprocess_input(images_resized)

In [21]:
y_proba = model.predict(inputs)
y_proba.shape

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step


(2, 1000)

In [22]:
top_K = tf.keras.applications.resnet50.decode_predictions(y_proba, top=4)
for image_index in range(len(images)):
    print(f'Image #{image_index}')
    for class_id, name, y_proba in top_K[image_index]:
        print(f'  {class_id} - {name:12s} {y_proba:.2%}')

Image #0
  n03598930 - jigsaw_puzzle 30.61%
  n02782093 - balloon      17.16%
  n03888257 - parachute    5.58%
  n06359193 - web_site     3.84%
Image #1
  n04209133 - shower_cap   34.36%
  n09229709 - bubble       11.40%
  n02782093 - balloon      9.46%
  n07745940 - strawberry   4.94%


## Pretrained Models for Transfer Learning

If you want to build an image classifier but don't have enough data to train it from scratch, it is often a good idea to reuse the lower layers of a pretrained model. <br>
For this example we will reuse parts of the Xception model

In [24]:
import tensorflow_datasets as tfds

dataset,info = tfds.load('tf_flowers', as_supervised=True, with_info=True)
dataset_size = info.splits['train'].num_examples
class_names = info.features['label'].names
n_classes = info.features['label'].num_classes

In [26]:
test_set_raw, valid_set_raw, train_set_raw = tfds.load(
    "tf_flowers",
    split=["train[:10%]", "train[10%:25%]", "train[25%:]"],
    as_supervised=True)

In [27]:
## Before you can batch, you need to make sure all the images have same dimensions
batch_size = 32
preprocess = tf.keras.Sequential([
    tf.keras.layers.Resizing(height=224, width=224, crop_to_aspect_ratio=True),
    tf.keras.layers.Lambda(tf.keras.applications.xception.preprocess_input)
])

train_set = train_set_raw.map(lambda X, y: (preprocess(X), y))
train_set = train_set.shuffle(1000, seed=42).batch(batch_size).prefetch(1)
valid_set = valid_set_raw.map(lambda X, y: (preprocess(X), y)).batch(batch_size)
test_set = test_set_raw.map(lambda X, y: (preprocess(X), y)).batch(batch_size)

In [28]:
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip(mode="horizontal", seed=42),
    tf.keras.layers.RandomRotation(factor=0.05, seed=42),
    tf.keras.layers.RandomContrast(factor=0.2, seed=42)
])

In [29]:
## Load pretrained model without the top layer
base_model = tf.keras.applications.xception.Xception(weights='imagenet',
                                                    include_top=False)
avg = tf.keras.layers.GlobalAveragePooling2D()(base_model.output)
output = tf.keras.layers.Dense(n_classes, activation='softmax')(avg)
model = tf.keras.Model(inputs=base_model.input, outputs=output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m83683744/83683744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step 


In [31]:
## Freeze the Xception model layers
for layer in base_model.layers:
    layer.trainable = False

In [32]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer,
             metrics=['accuracy'])
history = model.fit(train_set, validation_data=valid_set, epochs=3)

Epoch 1/3


2025-01-12 20:00:20.071205: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:376] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608





[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.7082 - loss: 1.0316




[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 137ms/step - accuracy: 0.7093 - loss: 1.0295 - val_accuracy: 0.8330 - val_loss: 0.6542
Epoch 2/3
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 80ms/step - accuracy: 0.9146 - loss: 0.3402 - val_accuracy: 0.8603 - val_loss: 0.6304
Epoch 3/3
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 80ms/step - accuracy: 0.9258 - loss: 0.2423 - val_accuracy: 0.8657 - val_loss: 0.5935


In [33]:
## Now lets unfreeze some of the layers and train again
for layer in base_model.layers[56:]:
    layer.trainable = True

In [34]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer,
             metrics=['accuracy'])
history = model.fit(train_set, validation_data=valid_set, epochs=10)

Epoch 1/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 176ms/step - accuracy: 0.7958 - loss: 0.7019 - val_accuracy: 0.5172 - val_loss: 5.8457
Epoch 2/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 148ms/step - accuracy: 0.9143 - loss: 0.2761 - val_accuracy: 0.7350 - val_loss: 1.5171
Epoch 3/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 146ms/step - accuracy: 0.9608 - loss: 0.1260 - val_accuracy: 0.8730 - val_loss: 0.4024
Epoch 4/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 146ms/step - accuracy: 0.9694 - loss: 0.0894 - val_accuracy: 0.8693 - val_loss: 0.5044
Epoch 5/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 144ms/step - accuracy: 0.9739 - loss: 0.0785 - val_accuracy: 0.9038 - val_loss: 0.2958
Epoch 6/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 144ms/step - accuracy: 0.9889 - loss: 0.0353 - val_accuracy: 0.9093 - val_loss: 0.3785
Epoch 7/10
[1m86/86[

## Classification and Localization

Localization can be expressed as a regression task, to predict a bounding box around the object's center, predict the X cord, y cord, width and height of the box. Adding this part to the model is easy.

In [35]:
tf.random.set_seed(42)  # extra code – ensures reproducibility
base_model = tf.keras.applications.xception.Xception(weights="imagenet",
                                                     include_top=False)
avg = tf.keras.layers.GlobalAveragePooling2D()(base_model.output)
class_output = tf.keras.layers.Dense(n_classes, activation="softmax")(avg)
loc_output = tf.keras.layers.Dense(4)(avg)
model = tf.keras.Model(inputs=base_model.input,
                       outputs=[class_output, loc_output])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)  
model.compile(loss=["sparse_categorical_crossentropy", "mse"],
              loss_weights=[0.8, 0.2],  # depends on what you care most about
              optimizer=optimizer, metrics=["accuracy", "mse"])

<b>The main problem is we don't have these boxes in our original dataset. You would most likly have to do this yourself or use crowdsourcing.

## Object Detection

The task of classifying and localizing multiple objects in an image is called <b>Object Detection</b>.