# Deep Computer Vision Using Convolutional Neural Networks

## Implementing Convolutional Layers with Keras

In [1]:
## Get some sample images
from sklearn.datasets import load_sample_images
import tensorflow as tf

## Load scikit-learn's bundled sample images, center-crop them to 70x120,
## then multiply the pixel values by 1/255
images = tf.keras.layers.Rescaling(scale=1 / 255)(
    tf.keras.layers.CenterCrop(height=70, width=120)(
        load_sample_images()['images']
    )
)

2025-01-12 15:29:59.613574: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-12 15:29:59.792135: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736717399.856608    3985 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736717399.876772    3985 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-12 15:30:00.055834: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
images.shape
## 4D tensor: [batch, height, width, channels]
## 2 images in the tensor
## 70 is the height in pixels
## 120 is the width in pixels
## 3 is the number of color channels (most likely RGB)

TensorShape([2, 70, 120, 3])

In [3]:
## 32 filters with 7x7 kernels; padding='valid' is the Conv2D default and
## means no zero padding is applied
conv_layer = tf.keras.layers.Conv2D(
    filters=32,
    kernel_size=(7, 7),
    padding='valid',
)
fmaps = conv_layer(images)

I0000 00:00:1736717402.623201    3985 cuda_dnn.cc:529] Loaded cuDNN version 90300


In [4]:
fmaps.shape
## 32 channels (one feature map per filter)
## The height and width shrank (70x120 -> 64x114) because Conv2D doesn't zero pad by default
## 2 images

TensorShape([2, 64, 114, 32])

In [5]:
## Same convolution, this time with zero padding
conv_layer = tf.keras.layers.Conv2D(
    filters=32,
    kernel_size=(7, 7),
    padding='same',
)
fmaps = conv_layer(images)

In [6]:
## With padding='same' the height and width stay at 70x120; only the number
## of channels changes (3 -> 32)
fmaps.shape

TensorShape([2, 70, 120, 32])

In [7]:
## get_weights() returns the layer's kernels and biases as arrays
kernels, biases = conv_layer.get_weights()
biases.shape
## 1D shape [output_channels]: one bias per filter

(32,)

In [8]:
kernels.shape
## 4D shape [kernel_height, kernel_width, input_channels, output_channels]
## Here: 7x7 kernels over 3 input channels, producing 32 feature maps

(7, 7, 3, 32)

<b>You should also add activation functions and kernel initializers to convolutional layers, for the same reasons as for Dense layers.</b>

## Pooling Layers

Pooling layers are meant to subsample (i.e., shrink) the input image in order to reduce the computational load, the memory usage, and the number of parameters (thereby limiting the risk of overfitting).

### Pooling Layers in Keras

In [9]:
## Max pooling over 2x2 windows
max_pool = tf.keras.layers.MaxPool2D(pool_size=(2, 2))
## Average pooling over 2x2 windows
avg_pool = tf.keras.layers.AvgPool2D(pool_size=(2, 2))

In [10]:
### Depthwise pooling layer
class DepthPool(tf.keras.layers.Layer):
    """Max pooling along the channel (last) axis instead of the spatial axes.

    The last axis is split into consecutive groups of `pool_size` channels
    and only the maximum of each group is kept, so the output has
    `channels // pool_size` channels. All other dimensions are unchanged.

    Args:
        pool_size: number of consecutive channels reduced to one value.
        **kwargs: forwarded unchanged to `tf.keras.layers.Layer`.
    """

    def __init__(self, pool_size=2, **kwargs):
        super().__init__(**kwargs)
        self.pool_size = pool_size

    def call(self, inputs):
        """Return the maximum of every group of `pool_size` channels."""
        input_shape = tf.shape(inputs)
        leading_dims, n_channels = input_shape[:-1], input_shape[-1]
        n_groups = n_channels // self.pool_size  ## number of channel groups
        ## Split the channel axis into [n_groups, pool_size]; the reshape only
        ## succeeds when the channel count is a multiple of pool_size
        grouped_shape = tf.concat(
            [leading_dims, [n_groups, self.pool_size]], axis=0
        )
        grouped = tf.reshape(inputs, grouped_shape)
        return tf.reduce_max(grouped, axis=-1)

In [11]:
## Global average pooling: averages every feature map over its whole height
## and width, leaving a single value per channel
global_avg_pool = tf.keras.layers.GlobalAvgPool2D()

In [12]:
## One mean per image and per color channel: the result has shape (2, 3)
global_avg_pool(images)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.643388  , 0.59718215, 0.5825038 ],
       [0.7630747 , 0.26010972, 0.10848834]], dtype=float32)>

## CNN Architectures

In [16]:
# extra code – loads the Fashion MNIST dataset, adds the channels axis to the
#              inputs, scales the values by 1/255, and holds out the last
#              5,000 training samples as a validation set
import numpy as np

# NOTE: the variable is called `mnist`, but it holds Fashion MNIST
mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = mnist
X_train_full = X_train_full[..., np.newaxis].astype(np.float32) / 255
X_test = X_test[..., np.newaxis].astype(np.float32) / 255
X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]

### Basic CNN Structure

In [19]:
## Used on Fashion MNIST
from functools import partial

## Conv2D preset shared by the convolutional layers below: 3x3 kernels,
## 'same' padding, ReLU activation and He-normal initialization
DefaultConv2d = partial(tf.keras.layers.Conv2D, kernel_size=3, padding='same',
                        activation='relu', kernel_initializer='he_normal')
model = tf.keras.Sequential([
    ## Declare the input shape with an explicit Input layer: passing
    ## `input_shape=` to the first layer makes Keras emit a UserWarning
    tf.keras.layers.Input(shape=[28, 28, 1]),
    ### This first conv layer should capture lower-level features, so it has
    ### fewer filters (and a larger 7x7 kernel)
    DefaultConv2d(filters=64, kernel_size=7),
    tf.keras.layers.MaxPool2D(),
    DefaultConv2d(filters=128),
    DefaultConv2d(filters=128),
    tf.keras.layers.MaxPool2D(),
    ### These conv layers should capture the higher-level features, so they
    ### have more filters
    DefaultConv2d(filters=256),
    DefaultConv2d(filters=256),
    tf.keras.layers.MaxPool2D(),
    ### Classifier head: flatten, two dense layers with dropout, then a
    ### 10-unit softmax output
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu',
                          kernel_initializer='he_normal'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu',
                          kernel_initializer='he_normal'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
## Train with the Nadam optimizer and sparse categorical cross-entropy,
## tracking accuracy on the training and validation sets.
## NOTE(review): the saved output of this cell shows "Epoch 1/10", so it was
## produced by a run with epochs=10 -- re-run the cell to refresh the output.
model.compile(
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    optimizer='nadam',
    metrics=['accuracy'],
)
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_valid, y_valid),
)

Epoch 1/10


I0000 00:00:1736718792.413770    4050 service.cc:148] XLA service 0x7f9e0c004d10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1736718792.413897    4050 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2025-01-12 15:53:12.482007: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m  33/1719[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8s[0m 5ms/step - accuracy: 0.1556 - loss: 2.6460    

I0000 00:00:1736718795.088315    4050 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - accuracy: 0.6032 - loss: 1.1305 - val_accuracy: 0.8650 - val_loss: 0.3915
Epoch 2/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8477 - loss: 0.4480 - val_accuracy: 0.8834 - val_loss: 0.3058
Epoch 3/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8781 - loss: 0.3611 - val_accuracy: 0.8944 - val_loss: 0.3026
Epoch 4/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8970 - loss: 0.3063 - val_accuracy: 0.9036 - val_loss: 0.2577
Epoch 5/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9002 - loss: 0.2906 - val_accuracy: 0.9058 - val_loss: 0.2568
Epoch 6/10
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9093 - loss: 0.2643 - val_accuracy: 0.9124 - val_loss: 0.2468
Epoch 7/10
[1m1719/1719[

### List of other Architectures
- LeNet-5
- AlexNet
- GoogLeNet
- VGGNet
- ResNet
- Xception (a variant of GoogLeNet)
- SENet
- ResNeXt
- DenseNet
- MobileNet
- CSPNet
- EfficientNet

## Implementing a ResNet-34 CNN

In [21]:
## Conv2D preset for the ResNet: 3x3 kernels, stride 1, 'same' padding,
## He-normal initialization, no activation and no bias term.
## NOTE: not the same object as `DefaultConv2d` (lower-case "d") defined for
## the basic CNN, which uses different defaults.
DefaultConv2D = partial(
    tf.keras.layers.Conv2D,
    kernel_size=3,
    strides=1,
    padding='same',
    kernel_initializer='he_normal',
    use_bias=False,
)

class ResidualUnit(tf.keras.layers.Layer):
    """Residual unit: a two-convolution main path plus a skip connection.

    The main path is conv -> batch norm -> activation -> conv -> batch norm.
    Its output is added to the skip path and the activation is applied to
    the sum. The skip path passes the inputs through unchanged unless
    `strides > 1`, in which case it is a 1x1 convolution (same `filters`
    and `strides`) followed by batch normalization.

    Args:
        filters: number of filters for every convolution in the unit.
        strides: stride of the first main-path convolution and of the
            skip-path convolution, when there is one.
        activation: activation identifier resolved with
            `tf.keras.activations.get`.
        **kwargs: forwarded unchanged to `tf.keras.layers.Layer`.
    """

    def __init__(self, filters, strides=1, activation='relu', **kwargs):
        super().__init__(**kwargs)
        self.activation = tf.keras.activations.get(activation)

        ## Main path; only the first convolution applies `strides`
        first_conv = DefaultConv2D(filters, strides=strides)
        first_norm = tf.keras.layers.BatchNormalization()
        second_conv = DefaultConv2D(filters)
        second_norm = tf.keras.layers.BatchNormalization()
        self.main_layers = [
            first_conv, first_norm, self.activation, second_conv, second_norm,
        ]

        ## Skip path; a projection is only built when the unit downsamples
        if strides > 1:
            self.skip_layers = [
                DefaultConv2D(filters, kernel_size=1, strides=strides),
                tf.keras.layers.BatchNormalization(),
            ]
        else:
            self.skip_layers = []

    def call(self, inputs):
        """Run both paths on `inputs` and return activation(main + skip)."""
        main_out = inputs
        for main_layer in self.main_layers:
            main_out = main_layer(main_out)

        shortcut = inputs
        for skip_layer in self.skip_layers:
            shortcut = skip_layer(shortcut)

        return self.activation(main_out + shortcut)

In [25]:
## Stem: 7x7 stride-2 convolution, batch norm, ReLU and a 3x3 stride-2 max pool
res_net_34 = tf.keras.Sequential([
    ## Declare the input shape with an explicit Input layer: passing
    ## `input_shape=` to the first layer makes Keras emit a UserWarning
    tf.keras.layers.Input(shape=[224, 224, 3]),
    DefaultConv2D(64, kernel_size=7, strides=2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same')
])

## 3 + 4 + 6 + 3 = 16 residual units. Whenever the filter count changes from
## one unit to the next, that unit uses strides=2 (which also makes it build a
## projection on its skip path); otherwise strides=1.
prev_filters = 64
for filters in [64] * 3 + [128] * 4 + [256] * 6 + [512] * 3:
    strides = 1 if filters == prev_filters else 2
    res_net_34.add(ResidualUnit(filters, strides=strides))
    prev_filters = filters

## Head: global average pooling followed by a 10-unit softmax layer.
## GlobalAvgPool2D already returns one value per channel, so the Flatten layer
## does not change the shape; it is kept so the layer list stays the same.
res_net_34.add(tf.keras.layers.GlobalAvgPool2D())
res_net_34.add(tf.keras.layers.Flatten())
res_net_34.add(tf.keras.layers.Dense(10, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Using Pretrained Models from Keras

In [33]:
## Load ResNet50 with weights pretrained on ImageNet
## NOTE: this rebinds `model`, which earlier referred to the Fashion MNIST CNN
model = tf.keras.applications.ResNet50(weights='imagenet')

In [34]:
## Reload the raw sample images (this rebinds `images`, which previously held
## the cropped and rescaled tensor) and resize them to 224x224, cropping as
## needed to keep the aspect ratio
images = load_sample_images()['images']
images_resized = tf.keras.layers.Resizing(
    height=224,
    width=224,
    crop_to_aspect_ratio=True,
)(images)

In [35]:
## Run the resized images through ResNet50's own preprocess_input function
## before feeding them to the model
inputs = tf.keras.applications.resnet50.preprocess_input(images_resized)

In [36]:
## Predictions for both images: one row per image, 1,000 values per row
y_proba = model.predict(inputs)
y_proba.shape

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


(2, 1000)

In [37]:
## Print the 4 highest-scoring classes for each image.
## The inner loop variable is named `proba` so that it does not overwrite the
## `y_proba` prediction array; otherwise `y_proba` would end up holding a
## single float and this cell would fail when run a second time.
top_K = tf.keras.applications.resnet50.decode_predictions(y_proba, top=4)
for image_index, top_predictions in enumerate(top_K):
    print(f'Image #{image_index}')
    for class_id, name, proba in top_predictions:
        print(f'  {class_id} - {name:12s} {proba:.2%}')

Image #0
  n03598930 - jigsaw_puzzle 30.61%
  n02782093 - balloon      17.17%
  n03888257 - parachute    5.58%
  n06359193 - web_site     3.83%
Image #1
  n04209133 - shower_cap   34.29%
  n09229709 - bubble       11.40%
  n02782093 - balloon      9.46%
  n07745940 - strawberry   4.94%


## Pretrained Models for Transfer Learning