# Basics of Device Placements in TensorFlow2 

Make sure to select GPU runtime type (Runtime -> Change runtime type), before running this notebook

In this notebook we will compare performance of execution on GPU vs CPU for matrix math and model compile and train

In [1]:
import tensorflow as tf
import numpy as np

import time

from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import layers

print(tf.__version__)

2.8.2


In [2]:
#List all physical devices. Lets write a function for this

def fn_devicetypes(str_device_type=None):
  list_device_types = tf.config.list_physical_devices(device_type=str_device_type)
  return list_device_types

In [3]:
print(fn_devicetypes())
print(fn_devicetypes("GPU"))
print(fn_devicetypes("CPU"))

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [4]:
#We can get the GPU devicename using the tf.test.gpu_device_name()

tf.test.gpu_device_name()

'/device:GPU:0'

In [5]:
#
# TensorFlow automatically allocate Tensor operations to a physical device 
# and will handle the copying between CPU and GPU memory

# Let's define a random Tensor and check the allocated device

tensor_x = tf.random.uniform([3,3])
tensor_x.device

'/job:localhost/replica:0/task:0/device:GPU:0'

In [6]:
# The output string above will end with 'GPU:K' if the said Tensor is placed on the K-th GPU device.
# We can also check if a tensor is placed on a specific devise by using the device_endswith which 
# returns either a True or a False

print(tensor_x.device.endswith('CPU:0'))
print(tensor_x.device.endswith('GPU:0'))
print(tensor_x.device.endswith('GPU:1'))

False
True
False


#### Matrix Addition and Multiplication Performance on GPU vs CPU

In [7]:
# We can specify the device placement for a tensor if the device is available. 
# There are benefits to running tensor operations on the GPU and we will observe
# this by performing matrix operations on the said tensor on the GPU and CPU
# and compare them


def matrix_addition(tensor_x, str_device_type):
  with tf.device(str_device_type):
    start = time.time()
    for step in range(10):
      tf.add(tensor_x, tensor_x)
    time_elapsed = time.time() - start
    print("Elapsed Time for Matrix Addition on the " + str_device_type + "{:0.2f}ms".format(1000*time_elapsed))


def matrix_multiply(tensor_x, str_device_type):
  with tf.device(str_device_type):
    start = time.time()
    for step in range(10):
      tf.multiply(tensor_x, tensor_x)
    time_elapsed = time.time() - start
    print("Elapsed Time for Matrix Multiplication on the " + str_device_type + "{:0.2f}ms".format(1000*time_elapsed))


In [8]:
# Creater a random tensor
def init_tensor():
  tensor_x = tf.random.uniform([3,3])
  return tensor_x

In [11]:

matrix_addition(init_tensor(), "CPU:0")
matrix_addition(init_tensor(), "GPU:0")
print("\n")
matrix_multiply(init_tensor(), "CPU:0")
matrix_multiply(init_tensor(), "GPU:0")

Elapsed Time for Matrix Addition on the CPU:01.19ms
Elapsed Time for Matrix Addition on the GPU:00.67ms


Elapsed Time for Matrix Multiplication on the CPU:05.66ms
Elapsed Time for Matrix Multiplication on the GPU:00.51ms


#### Compiling and Training Performance on GPU vs CPU

In [12]:
# This difference in time becomes more evident when we are actually training a model. 
# Lets try that out a simple image classification model using the MNIST database

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, y_train = x_train[:1000], y_train[:1000]
x_train, x_test = x_train/255., x_test/255.

In [13]:
# Now lets build the simple sequential model with a few Conv, maxpooling layers for the sake of this exercise

# Let's define our get_model function

def get_model():
  model = Sequential([
      layers.Conv2D(32, (3,3), activation = 'relu', padding = 'same', input_shape = (28,28,1)),
      layers.MaxPool2D((2,2)),
      layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
      layers.MaxPooling2D((2, 2)),
      layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
      layers.MaxPooling2D((2, 2)),
      layers.Flatten(),
      layers.Dense(64, activation = 'relu'),
      layers.Dense(10, activation = 'softmax')
  ])
  return model

In [14]:
# Let's define our compile and fit function that takes the device name as a parameter

def compile_fit_model(str_device_type):
  with tf.device(str_device_type):
    model = get_model()
    model.compile(optimizer = RMSprop(1e-3), loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    start = time.time()
    model.fit(x_train[...,np.newaxis], y_train, epochs = 6, verbose = 0)
    elapsed_time = time.time() - start
    print("Elapsed Time for compile and fit of the model on the " + str_device_type + "{:0.2f}ms".format(1000*elapsed_time))


In [15]:
# Call compile_fit_model for GPU device

compile_fit_model("CPU:0")

Elapsed Time for compile and fit of the model on the CPU:08735.02ms


In [16]:
# Call compile_fit_model for CPU device

compile_fit_model("GPU:0")

Elapsed Time for compile and fit of the model on the GPU:05700.47ms
