# Tensorflow With DirectML

```
mkdir jupyter-tensordml
cd jupyter-tensordml
conda create -n tensordml python=3.10 -y
conda activate tensordml
pip install tensorflow-cpu==2.10
pip install tensorflow-directml-plugin
pip install jupyterlab
pip install py-cpuinfo
conda install "numpy<2" -y
jupyter lab
```

### Pip list

```
(tensordml) C:\Users\phill\jupyter-tensordml>pip list
Package                      Version
---------------------------- ---------------
absl-py                      2.1.0
anyio                        4.9.0
argon2-cffi                  23.1.0
argon2-cffi-bindings         21.2.0
arrow                        1.3.0
asttokens                    3.0.0
astunparse                   1.6.3
async-lru                    2.0.5
attrs                        25.3.0
babel                        2.17.0
beautifulsoup4               4.13.3
bleach                       6.2.0
cachetools                   5.5.2
certifi                      2025.1.31
cffi                         1.17.1
charset-normalizer           3.4.1
colorama                     0.4.6
comm                         0.2.2
debugpy                      1.8.13
decorator                    5.2.1
defusedxml                   0.7.1
exceptiongroup               1.2.2
executing                    2.2.0
fastjsonschema               2.21.1
flatbuffers                  25.2.10
fqdn                         1.5.1
gast                         0.4.0
google-auth                  2.38.0
google-auth-oauthlib         0.4.6
google-pasta                 0.2.0
grpcio                       1.71.0
h11                          0.14.0
h5py                         3.13.0
httpcore                     1.0.7
httpx                        0.28.1
idna                         3.10
ipykernel                    6.29.5
ipython                      8.34.0
isoduration                  20.11.0
jedi                         0.19.2
Jinja2                       3.1.6
json5                        0.10.0
jsonpointer                  3.0.0
jsonschema                   4.23.0
jsonschema-specifications    2024.10.1
jupyter_client               8.6.3
jupyter_core                 5.7.2
jupyter-events               0.12.0
jupyter-lsp                  2.2.5
jupyter_server               2.15.0
jupyter_server_terminals     0.5.3
jupyterlab                   4.3.6
jupyterlab_pygments          0.3.0
jupyterlab_server            2.27.3
keras                        2.10.0
Keras-Preprocessing          1.1.2
libclang                     18.1.1
Markdown                     3.7
MarkupSafe                   3.0.2
matplotlib-inline            0.1.7
mistune                      3.1.3
mkl_fft                      1.3.11
mkl_random                   1.2.8
mkl-service                  2.4.0
nbclient                     0.10.2
nbconvert                    7.16.6
nbformat                     5.10.4
nest-asyncio                 1.6.0
notebook_shim                0.2.4
numpy                        1.26.4
oauthlib                     3.2.2
opt_einsum                   3.4.0
overrides                    7.7.0
packaging                    24.2
pandocfilters                1.5.1
parso                        0.8.4
pip                          25.0
platformdirs                 4.3.7
prometheus_client            0.21.1
prompt_toolkit               3.0.50
protobuf                     3.19.6
psutil                       7.0.0
pure_eval                    0.2.3
pyasn1                       0.6.1
pyasn1_modules               0.4.1
pycparser                    2.22
Pygments                     2.19.1
python-dateutil              2.9.0.post0
python-json-logger           3.3.0
pywin32                      310
pywinpty                     2.0.15
PyYAML                       6.0.2
pyzmq                        26.3.0
referencing                  0.36.2
requests                     2.32.3
requests-oauthlib            2.0.0
rfc3339-validator            0.1.4
rfc3986-validator            0.1.1
rpds-py                      0.23.1
rsa                          4.9
Send2Trash                   1.8.3
setuptools                   75.8.0
six                          1.17.0
sniffio                      1.3.1
soupsieve                    2.6
stack-data                   0.6.3
tensorboard                  2.10.1
tensorboard-data-server      0.6.1
tensorboard-plugin-wit       1.8.1
tensorflow-cpu               2.10.0
tensorflow-directml-plugin   0.4.0.dev230202
tensorflow-estimator         2.10.0
tensorflow_intel             2.10.0
tensorflow-io-gcs-filesystem 0.31.0
termcolor                    2.5.0
terminado                    0.18.1
tinycss2                     1.4.0
tomli                        2.2.1
tornado                      6.4.2
traitlets                    5.14.3
types-python-dateutil        2.9.0.20241206
typing_extensions            4.12.2
uri-template                 1.3.0
urllib3                      2.3.0
wcwidth                      0.2.13
webcolors                    24.11.1
webencodings                 0.5.1
websocket-client             1.8.0
Werkzeug                     3.1.3
wheel                        0.45.1
wrapt                        1.17.2
```

#### Can TensorFlow see the DirectML resource?

In [None]:
import tensorflow as tf

# Log the device each op is placed on, so the placement of the test op
# below can be confirmed from the log output.
tf.debugging.set_log_device_placement(True)

# List every physical device TensorFlow can see.
print("Available physical devices:")
for device in tf.config.list_physical_devices():
    print(device)

# Specifically check for GPU-type devices (the DirectML plugin is expected
# to expose its adapter under the 'GPU' device type).
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    print("\nGPU devices found:")
    for gpu in gpu_devices:
        print(gpu)

    # Only attempt the GPU placement test when a GPU actually exists;
    # otherwise the op would either fail or silently fall back to the CPU
    # while the output below still claimed it ran on the GPU.
    with tf.device('/GPU:0'):  # Explicitly place on the first GPU
        a = tf.constant([[1.0, 2.0]])
        b = tf.constant([[3.0], [4.0]])
        c = tf.matmul(a, b)
    print("\nResult of matrix multiplication on GPU:")
    print(c)
    # Report where the result tensor really lives rather than assuming.
    print("Executed on:", c.device)
else:
    print("\nNo GPU devices found.")
    print("Skipping the GPU matrix multiplication test.")

#### Check CPU Flags

In [None]:
# Requires the third-party py-cpuinfo package: pip install py-cpuinfo
import cpuinfo

info = cpuinfo.get_cpu_info()

# Use .get() so a missing field is reported instead of raising KeyError.
flags = info.get('flags', [])
print("CPU:", info.get('brand_raw', 'unknown'))
print("Flags:", flags)  # Look for 'avx512f' (foundation) and others like 'avx512dq', 'avx512bw'

# Pull out the AVX-512 feature flags so they do not have to be found by eye.
avx512_flags = sorted(flag for flag in flags if flag.startswith('avx512'))
print("AVX-512 flags:", avx512_flags if avx512_flags else "none detected")

#### Training FP32 9950X3D vs RX 9070 XT

This is my first real dip into TensorFlow, and also my second attempt at training a model with DirectML. So far, TF is much more compliant with DirectML. There are limitations with DirectML and the version of TF (2.10) that I am running: we cannot utilize mixed precision, so we are limited to the FP32 dtype.

- RX 9070 XT
    - Test accuracy: 0.9851
    - Elapsed time: 3.16 seconds

- 9950X3D AVX2 (90% CPU Utilization)
    - Test accuracy: 0.9843
    - Elapsed time: 35.09 seconds

- 9950X3D AVX512 (65% CPU Utilization)
    - Test accuracy: 0.9854
    - Elapsed time: 30.40 seconds

#### <font color="red">Restart The Python Kernel When Flipping Between Devices</font>

In [None]:
import os

# Optimize for CPU with oneDNN and threading.
# These are set BEFORE TensorFlow is imported so the native libraries can
# pick them up when they initialize; setting them after the import is
# likely too late to have any effect. If TensorFlow was already imported
# in this kernel, restart the kernel first.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '1'  # Enable oneDNN for AVX-512
os.environ['OMP_NUM_THREADS'] = '16'  # Match physical cores (16 for 9950X3D)
os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'  # Pin threads to cores

import time

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

# Configuration: Set to 'CPU' or 'GPU'
DEVICE = 'GPU'  # 'GPU' = DirectML adapter, 'CPU' = oneDNN (AVX2/AVX-512) path
if DEVICE not in ('CPU', 'GPU'):
    # Fail loudly instead of silently benchmarking the wrong device.
    raise ValueError(f"DEVICE must be 'CPU' or 'GPU', got {DEVICE!r}")

# Threading must be configured before TensorFlow initializes its runtime,
# so this happens before any other TensorFlow work in the cell.
tf.config.threading.set_intra_op_parallelism_threads(16)  # One thread per core
tf.config.threading.set_inter_op_parallelism_threads(2)  # Minimal inter-op parallelism
tf.debugging.set_log_device_placement(False)  # Set True to verify op placement
tf.config.optimizer.set_jit(True)  # Enable XLA

if DEVICE == 'GPU' and not tf.config.list_physical_devices('GPU'):
    # Without this check the run could fall back to the CPU and the timing
    # would be reported as a GPU result.
    raise RuntimeError("DEVICE is 'GPU' but TensorFlow reports no GPU devices.")

# Load and preprocess the MNIST dataset: scale pixels to [0, 1] and add a
# trailing channel axis for the Conv2D layers.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = np.expand_dims(x_train, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)

# Convert labels to categorical (one-hot over the 10 digit classes)
y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)

# Create datasets explicitly
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(512).prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(512).prefetch(tf.data.AUTOTUNE)

# Set device based on configuration
device_name = '/CPU:0' if DEVICE == 'CPU' else '/GPU:0'

# Build and compile the model inside the device scope so its variables are
# created on the device being benchmarked, not on whatever the default is.
with tf.device(device_name):
    # Define the model
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(10, activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

# Print model summary
model.summary()

# Time training plus the final evaluation on the selected device.
with tf.device(device_name):
    start_time = time.perf_counter()
    history = model.fit(train_dataset,
                        epochs=5,
                        validation_data=val_dataset)
    test_loss, test_acc = model.evaluate(val_dataset)
    end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"\nTest accuracy: {test_acc:.4f}\nElapsed time: {elapsed_time:.2f} seconds")