# [TensorFlow Profiler](https://github.com/tensorflow/profiler)

- [GitHub repository](https://github.com/tensorflow/profiler)
- [Performance Guide](https://www.tensorflow.org/guide/profiler)
- [Mixed precision guide](https://www.tensorflow.org/guide/keras/mixed_precision)
- [Grappler optimization](https://www.tensorflow.org/guide/graph_optimization)

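If profiling fails with `ERR_NVGPUCTRPERM`, see [Permission issue with Performance Counters](https://developer.nvidia.com/nvidia-development-tools-solutions-ERR_NVGPUCTRPERM-permission-issue-performance-counters). The following command disables the driver option that restricts profiling to admin users: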

- `echo 'options nvidia "NVreg_RestrictProfilingToAdminUsers=0"' | sudo tee -a /etc/modprobe.d/nvidia.conf`

In [None]:
!pip uninstall -q -y tensorboard tensorflow
!pip install -q -U tf-nightly tb-nightly tensorboard_plugin_profile

In [None]:
import os
import time
from typing import Dict, Tuple

import tensorflow as tf
import tensorflow_datasets as tfds

from datetime import datetime
from tensorflow.keras.mixed_precision import experimental as mixed_precision

tfds.disable_progress_bar()
%load_ext nb_black
%load_ext tensorboard

In [None]:
def preprocess_fn(
    feature: Dict[str, tf.Tensor], output_height: int = 28, output_width: int = 28
) -> Dict[str, tf.Tensor]:
    """Convert one raw example into a normalized float image and an int64 label.

    Args:
        feature: Example dict providing "image" and "label" entries.
        output_height: Target image height for the central crop / zero pad.
        output_width: Target image width for the central crop / zero pad.

    Returns:
        Dict with "image" cast to float32, cropped or padded to
        (output_height, output_width) and rescaled as (x - 128) / 128, and
        "label" cast to int64.
    """
    image = tf.cast(feature["image"], tf.float32)
    label = tf.cast(feature["label"], tf.int64)
    # resize_with_crop_or_pad expects (image, target_height, target_width);
    # height must come first or non-square targets come out transposed.
    image = tf.image.resize_with_crop_or_pad(image, output_height, output_width)
    # Center and scale; for 0-255 pixel inputs this gives roughly [-1, 1).
    image = (image - 128.0) / 128.0
    return {"image": image, "label": label}


def read_data(name: str) -> Tuple[tf.data.Dataset, tf.data.Dataset]:
    """Load the train and test splits of a TFDS dataset as batched pipelines.

    Both splits are read from "/data/tfds", mapped through `preprocess_fn`
    with 8 parallel calls, batched by 128 and prefetched. Only the training
    split is shuffled (buffer of 1000 examples), and that happens before
    preprocessing.

    Args:
        name: Dataset name handed to `tfds.load`.

    Returns:
        A `(ds_train, ds_test)` tuple of datasets.
    """
    autotune = tf.data.experimental.AUTOTUNE
    splits = tfds.load(name=name, split=["train", "test"], data_dir="/data/tfds")
    ds_train, ds_test = splits

    # Training pipeline: shuffle -> preprocess -> batch -> prefetch.
    ds_train = ds_train.shuffle(1000)
    ds_train = ds_train.map(preprocess_fn, num_parallel_calls=8)
    ds_train = ds_train.batch(128)
    ds_train = ds_train.prefetch(autotune)

    # Test pipeline: same steps minus the shuffle.
    ds_test = ds_test.map(preprocess_fn, num_parallel_calls=8)
    ds_test = ds_test.batch(128)
    ds_test = ds_test.prefetch(autotune)

    return ds_train, ds_test


def mnist_model(
    num_classes: int = 10,
    weight_decay: float = 0.0,
    prob: float = 0.5,
    input_shape: Tuple[int, int, int] = (28, 28, 1),
) -> tf.keras.Sequential:
    """Build the conv-pool-conv-pool-dense classifier used in this notebook.

    Args:
        num_classes: Number of units in the final Dense layer.
        weight_decay: L2 factor applied to the kernels of both Conv2D layers
            and the 1024-unit Dense layer (the final Dense is unregularized).
        prob: Rate given to the Dropout layer.
        input_shape: Input shape declared on the first Conv2D layer.

    Returns:
        A `tf.keras.Sequential` whose last layer is a linear activation with
        dtype float32, so the model outputs float32 logits.
    """
    layers = tf.keras.layers
    kernel_init = tf.keras.initializers.TruncatedNormal(stddev=0.1)
    # Settings shared by every ReLU layer in the stack.
    relu_kwargs = {"activation": "relu", "kernel_initializer": kernel_init}

    def l2_penalty():
        # A separate regularizer object per layer, as many as there are
        # regularized layers.
        return tf.keras.regularizers.l2(weight_decay)

    stack = [
        layers.Conv2D(
            filters=32,
            kernel_size=5,
            kernel_regularizer=l2_penalty(),
            input_shape=input_shape,
            **relu_kwargs,
        ),
        layers.MaxPool2D(2, 2),
        layers.Conv2D(
            filters=64,
            kernel_size=5,
            kernel_regularizer=l2_penalty(),
            **relu_kwargs,
        ),
        layers.MaxPool2D(2, 2),
        layers.Flatten(),
        layers.Dense(1024, kernel_regularizer=l2_penalty(), **relu_kwargs),
        layers.Dropout(prob),
        layers.Dense(num_classes, kernel_initializer=kernel_init),
        # Explicit float32 output layer, independent of the global dtype policy.
        layers.Activation("linear", dtype="float32"),
    ]
    return tf.keras.Sequential(stack)

In [None]:
# Training setup. Statement order matters: the "mixed_float16" policy is
# installed globally first, and only afterwards is the model built further
# down in this cell.
policy = mixed_precision.Policy("mixed_float16")
mixed_precision.set_policy(policy)

# Adam wrapped in a loss-scaling optimizer with a dynamic loss scale;
# train_step relies on its get_scaled_loss / get_unscaled_gradients methods.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
optimizer = mixed_precision.LossScaleOptimizer(optimizer, loss_scale="dynamic")

# from_logits=True because mnist_model ends in a linear activation, i.e. the
# model outputs raw logits rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Optional graph-optimizer experiment, disabled by default. Uncomment to
# enable the listed options and print the resulting settings.
# tf.config.optimizer.set_experimental_options(
#     {"layout_optimizer": True, "shape_optimization": True}
# )
# print(tf.config.optimizer.get_experimental_options())

# Input pipelines and model are created last, after the policy is in place.
ds_train, ds_test = read_data(name="mnist")
model = mnist_model()

In [None]:
# @tf.function(experimental_compile=True)
@tf.function
def train_step(features):
    """Run one optimization step on a batch and return its unscaled loss.

    Uses the notebook-level `model`, `loss` and `optimizer`. The loss is
    scaled before differentiation and the gradients are unscaled before
    being applied, as required by the loss-scaling optimizer.

    Args:
        features: Batch dict with "image" and "label" entries.

    Returns:
        The loss value for this batch, before loss scaling.
    """
    # Read the batch from the argument. Reading a same-named notebook global
    # here would be baked into the traced graph instead of following the
    # batch passed on each call.
    images, labels = features["image"], features["label"]
    with tf.GradientTape() as tape:
        # training=True so the Dropout layer is active during the step.
        logits = model(images, training=True)
        loss_value = loss(labels, logits)
        scaled_loss = optimizer.get_scaled_loss(loss_value)
    scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss_value

In [None]:
ds_train_ = ds_train.take(60)

# Warm-up: run a few steps before profiling so one-time costs of the first
# calls to train_step are not part of the profiled window.
for features in ds_train_.take(10):
    train_step(features)

# NOTE(review): each take() below and above starts a fresh pass over
# ds_train_, so the warm-up and profiled loops do not split the 60 batches
# between them -- confirm whether disjoint batches were intended.
train_loss = None
steps_run = 0
tf.profiler.experimental.start("./mnist_logs")
try:
    start_time = time.perf_counter()
    for features in ds_train_.take(50):
        train_loss = train_step(features)
        steps_run += 1
    if train_loss is not None:
        # Fetch the last loss so the final step has finished before the
        # clock is read.
        train_loss.numpy()
    # Read the clock before stopping the profiler so its shutdown work is
    # not counted in the per-step time.
    elapsed = time.perf_counter() - start_time
finally:
    # Always stop, otherwise a failed step leaves the profiler running.
    tf.profiler.experimental.stop()

if steps_run:
    # Divide by the number of steps actually executed, not a fixed 50.
    print(f"Loss: {train_loss},\t{elapsed / steps_run:.5f} sec/step")
else:
    print("No training steps were run; nothing to report.")

In [None]:
%tensorboard --logdir ./mnist_logs --bind_all