# eager vs graph running time 

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
# To start eager execution (this must be top of code)
tf.executing_eagerly()
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus: tf.config.experimental.set_memory_growth(gpus[0], True)


## time measurement
In this section, we measure a calculation time.

$$
f({\bf x}) = {\bf W_3W_2W_1x}
$$


In [3]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(1000),
    tf.keras.layers.Dense(1000),
    tf.keras.layers.Dense(1),
])

In [5]:
# batch_size is 1024.
x = tf.random.normal([1024, 1000])
y = tf.random.normal([1024, 1])

def loss(y, y_pre):
    return tf.losses.mean_squared_error(y, y_pre)

# initialize optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

## Eager Execution

In [6]:
def measurement(gpu=False):
    if gpu:
        device = "/gpu:0"
    else:
        device = "/cpu:0" 
        
    with tf.device(device):
        with tf.GradientTape() as tape:
            y_pre = model(x)
            loss_value = loss(y, y_pre)
        grads = tape.gradient(loss_value, model.variables)
        optimizer.apply_gradients(zip(grads, model.variables))

In [9]:
%%timeit
measurement(False)

38.8 ms ± 324 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
%%timeit
measurement(True)

15.7 ms ± 70.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## graph

In [12]:
@tf.function
def graph_measurement(gpu=False):
    if gpu:
        device = "/gpu:0"
    else:
        device = "/cpu:0" 
        
    with tf.device(device):
        with tf.GradientTape() as tape:
            y_pre = model(x)
            loss_value = loss(y, y_pre)
        grads = tape.gradient(loss_value, model.variables)
        optimizer.apply_gradients(zip(grads, model.variables))

In [14]:
%%timeit
graph_measurement(False)

28.5 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit
graph_measurement(True)

5.52 ms ± 48.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## for loop Eager

In [16]:
def measurement_forloop(gpu=False):
    if gpu:
        device = "/gpu:0"
    else:
        device = "/cpu:0" 
    for _ in range(10):
        with tf.device(device):
            with tf.GradientTape() as tape:
                y_pre = model(x)
                loss_value = loss(y, y_pre)
            grads = tape.gradient(loss_value, model.variables)
            optimizer.apply_gradients(zip(grads, model.variables))

In [17]:
%%timeit
measurement_forloop(False)

401 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit
measurement_forloop(True)

149 ms ± 1.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## for loop Graph

In [21]:
@tf.function
def graph_measurement_forloop(gpu=False):
    if gpu:
        device = "/gpu:0"
    else:
        device = "/cpu:0"
    with tf.device(device):
        for _ in range(10):
            with tf.GradientTape() as tape:
                y_pre = model(x)
                loss_value = loss(y, y_pre)
            grads = tape.gradient(loss_value, model.variables)
            optimizer.apply_gradients(zip(grads, model.variables))

In [22]:
%%timeit
graph_measurement_forloop(False)

279 ms ± 6.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%%timeit
graph_measurement_forloop(True)

47.8 ms ± 2.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## PyTorch part

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
model = nn.Sequential(
    nn.Linear(1000, 1000),
    nn.Linear(1000, 1000),
    nn.Linear(1000, 1),
)

In [3]:
x = torch.randn(1024, 1000)
y = torch.randn(1024, 1)

loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

## for loop Eager

In [5]:
def measurement_forloop(gpu=False):
    if gpu:
        device = "cuda"
    else:
        device = "cpu" 
        
    model.to(device)
    
    for _ in range(10):
        optimizer.zero_grad() 
        y_pre = model(x.to(device))
        loss_value = loss(y_pre, y.to(device))
        loss_value.backward()
        optimizer.step()

In [6]:
%%timeit
measurement_forloop(False)

427 ms ± 4.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
measurement_forloop(True)

44 ms ± 3.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
### for loop script (graph)

In [9]:
class Model(torch.jit.ScriptModule):
    
    def __init__(self):
        super(Model, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(1000, 1000),
            nn.Linear(1000, 1000),
            nn.Linear(1000, 1),
        ).to('cuda')

    @torch.jit.script_method
    def forward(self, x):
        return self.model(x)

model = Model()

In [10]:
x = x.to('cuda')
y = y.to('cuda')
def measurement_forloop_script():
    for _ in range(10):
        optimizer.zero_grad() 
        y_pre = model(x)
        loss_value = loss(y_pre, y)
        loss_value.backward()
        optimizer.step()

In [11]:
%%timeit
measurement_forloop_script()

35.5 ms ± 848 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
