Tensorflow model:

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# Data augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1.0/255,
    horizontal_flip=True,
    preprocessing_function=lambda img: tf.image.random_crop(img, size=[32, 32, 3])
)

# Only normalization for testing
test_datagen = ImageDataGenerator(rescale=1.0/255)

# Create data generators
train_generator = train_datagen.flow(x_train, y_train, batch_size=32)
test_generator = test_datagen.flow(x_test, y_test, batch_size=32)

In [2]:
import tensorflow as tf
from tensorflow.keras import layers

class ResidualBlock(tf.keras.Model):
    def __init__(self, filters, kernel_size=3, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = layers.Conv2D(filters, kernel_size, strides=stride, padding='same', use_bias=False)
        self.bn1 = layers.BatchNormalization()
        self.relu = layers.ReLU()
        self.conv2 = layers.Conv2D(filters, kernel_size, strides=1, padding='same', use_bias=False)
        self.bn2 = layers.BatchNormalization()
        
        # Adjust shortcut if stride > 1
        if stride != 1:
            self.shortcut = layers.Conv2D(filters, kernel_size=1, strides=stride, use_bias=False)
            self.shortcut_bn = layers.BatchNormalization()
        else:
            self.shortcut = lambda x: x  # Identity shortcut
        
    def call(self, x):
        shortcut = self.shortcut(x)
        if hasattr(self, 'shortcut_bn'):  
            shortcut = self.shortcut_bn(shortcut)
        
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        
        x = layers.Add()([x, shortcut])
        return self.relu(x)

class ResNet6(tf.keras.Model):
    def __init__(self, num_classes=10):
        super(ResNet6, self).__init__()
        self.conv = layers.Conv2D(16, 3, strides=1, padding='same', use_bias=False)
        self.bn = layers.BatchNormalization()
        self.relu = layers.ReLU()
        self.rb1 = ResidualBlock(16)
        self.rb2 = ResidualBlock(16)
        self.gap = layers.GlobalAveragePooling2D()
        self.fc = layers.Dense(num_classes)

    def call(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.rb1(x)
        x = self.rb2(x)
        x = self.gap(x)
        return self.fc(x)



In [3]:
# Build model
model_gpu = ResNet6()
model_gpu.build(input_shape=(None, 32, 32, 3))

model_cpu = ResNet6()
model_cpu.build(input_shape=(None, 32, 32, 3))
model_cpu.summary()

Model: "res_net6_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_5 (Conv2D)           multiple                  432       
                                                                 
 batch_normalization_5 (Batc  multiple                 64        
 hNormalization)                                                 
                                                                 
 re_lu_3 (ReLU)              multiple                  0         
                                                                 
 residual_block_2 (ResidualB  multiple                 4736      
 lock)                                                           
                                                                 
 residual_block_3 (ResidualB  multiple                 4736      
 lock)                                                           
                                                        

In [4]:
import numpy as np
np.random.seed(42)
tf.random.set_seed(42)
tf.keras.mixed_precision.set_global_policy('float32')


In [6]:
# show tensor devices
for tensor in model_gpu.trainable_variables:
    print(tensor.device)

/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:GPU:0


In [7]:
import os
import time

with tf.device('/GPU:0'):
    gpu_start = time.time()
    model_gpu.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_gpu.fit(train_generator, epochs=1, validation_data=test_generator)
    gpu_end = time.time()
    print(f"GPU time: {gpu_end - gpu_start} seconds")

with tf.device('/CPU:0'):
    cpu_start = time.time()
    model_cpu.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_cpu.fit(train_generator, epochs=1, validation_data=test_generator)
    cpu_end = time.time()
    print(f"CPU time: {cpu_end - cpu_start} seconds")

GPU time: 232.0684311389923 seconds
CPU time: 299.1338429450989 seconds


 model (ResNet6) is relatively small. For small models, the overhead of transferring data to the GPU and managing GPU resources can sometimes outweigh the benefits of parallel computation. As a result, the GPU might not provide a significant speedup compared to the CPU.