In [0]:
import os
import subprocess

from pathlib import Path
  
def _format_throughput(line, multiplier=1):
  """Extract the throughput figure from one raw output line of the training script.

  The figure is taken to be the second whitespace-separated token of the line,
  e.g. b"INFO:tensorflow:examples/sec: 12.5" -> "12.5".

  Args:
    line: one line of child-process output, as bytes.
    multiplier: factor applied to the parsed figure. With the default of 1
      the token is returned verbatim, without numeric conversion.

  Returns:
    The figure as a string, or None when the line has fewer than two tokens
    or (for multiplier != 1) the token is not a number.
  """
  fields = line.decode('utf-8', errors='replace').split()
  if len(fields) < 2:
    return None
  if multiplier == 1:
    return fields[1]
  try:
    return str(float(fields[1]) * multiplier)
  except ValueError:
    return None


def run_training():
  """Write the BERT-Large training shell script to /tmp, run it, and echo progress.

  The generated bash script installs a few apt packages, downloads the
  IntelAI models v1.8.1 archive into /TF/ and starts `launch_benchmark.py`
  in training mode. While the script runs, its output is scanned line by
  line and a single status line is printed (overwritten in place using a
  carriage return): "Preparing data" during setup, then the latest
  examples/sec figure once training reports throughput.

  When /sys/devices/system/node/node1 exists the reported figure is doubled,
  otherwise it is printed as reported.

  Returns:
    None. A non-zero exit status of the script is reported with a printed
    message rather than an exception.
  """
  training = '/tmp/training.sh'

  # Drop any copy left by an earlier run: it may have been made read-only,
  # which would make the open() below fail for a non-root user.
  try:
    os.remove(training)
  except FileNotFoundError:
    pass

  # Raw string: the backslashes in the awk patterns and the line continuations
  # must reach bash unchanged instead of being read as Python escapes.
  with open(training, 'w') as f:
    f.write(r"""#!/bin/bash
    # BERT-Large Training
    # Install necessary package
    sudo apt-get update
    sudo apt-get install zip -y
    sudo apt-get -y install git
    sudo apt-get install -y libblacs-mpi-dev
    sudo apt-get install -y numactl
    
    # Remove old materials if exist
    rm -rf /TF/
    mkdir /TF/
    # Create ckpt directory
    mkdir -p /TF/BERT-Large-output/
    # Download IntelAI benchmark
    cd /TF/
    wget https://github.com/IntelAI/models/archive/refs/tags/v1.8.1.zip
    unzip v1.8.1.zip
    
    cores_per_socket=$(lscpu | awk '/^Core\(s\) per socket/{ print $4 }')
    numa_nodes=$(lscpu | awk '/^NUMA node\(s\)/{ print $3 }')
    export SQUAD_DIR=/dbfs/home/TF/bert-large/SQuAD-1.1
    export BERT_LARGE_MODEL=/dbfs/home/TF/bert-large/wwm_uncased_L-24_H-1024_A-16
    export BERT_LARGE_OUTPUT=/TF/BERT-Large-output/
    export PYTHONPATH=$PYTHONPATH:.
    
    function run_training_without_numabind() {
     python launch_benchmark.py \
        --model-name=bert_large \
        --precision=fp32 \
        --mode=training \
        --framework=tensorflow \
        --batch-size=4 \
        --benchmark-only \
        --data-location=$BERT_LARGE_MODEL \
        -- train-option=SQuAD  DEBIAN_FRONTEND=noninteractive   config_file=$BERT_LARGE_MODEL/bert_config.json   init_checkpoint=$BERT_LARGE_MODEL/bert_model.ckpt     vocab_file=$BERT_LARGE_MODEL/vocab.txt train_file=$SQUAD_DIR/train-v1.1.json     predict_file=$SQUAD_DIR/dev-v1.1.json      do-train=True learning-rate=1.5e-5   max-seq-length=384     do_predict=True warmup-steps=0     num_train_epochs=0.1     doc_stride=128      do_lower_case=False     experimental-gelu=False     mpi_workers_sync_gradients=True
    }

    function run_training_with_numabind() {
      intra_thread=`expr $cores_per_socket - 2`
      python launch_benchmark.py \
        --model-name=bert_large \
        --precision=fp32 \
        --mode=training \
        --framework=tensorflow \
        --batch-size=4 \
        --mpi_num_processes=$numa_nodes \
        --num-intra-threads=$intra_thread \
        --num-inter-threads=1 \
        --benchmark-only \
        --data-location=$BERT_LARGE_MODEL \
        -- train-option=SQuAD  DEBIAN_FRONTEND=noninteractive   config_file=$BERT_LARGE_MODEL/bert_config.json init_checkpoint=$BERT_LARGE_MODEL/bert_model.ckpt     vocab_file=$BERT_LARGE_MODEL/vocab.txt train_file=$SQUAD_DIR/train-v1.1.json     predict_file=$SQUAD_DIR/dev-v1.1.json      do-train=True learning-rate=1.5e-5   max-seq-length=384     do_predict=True warmup-steps=0     num_train_epochs=0.1     doc_stride=128      do_lower_case=False     experimental-gelu=False     mpi_workers_sync_gradients=True
    }
    
    # Launch Benchmark for training
    cd /TF/models-1.8.1/benchmarks/
    
    if [ "$numa_nodes" = "1" ];then
            run_training_without_numabind
    else
            run_training_with_numabind
    fi """)

  # Octal literal: a decimal 555 would be mode 0o1053, which gives the owner
  # no permissions at all.
  os.chmod(training, 0o755)

  # The presence of a second NUMA node directory selects the doubled figure.
  # NOTE(review): the factor stays 2 even if more than two nodes exist.
  directory_to_second_numa_info = Path("/sys/devices/system/node/node1")
  multiplier = 2 if directory_to_second_numa_info.exists() else 1

  # stderr is merged into stdout so that no pipe is left unread; an unread
  # stderr pipe can fill up and block the child forever.
  p = subprocess.Popen([training], stdin=None, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  try:
    # readline() returns b'' only at end of stream, which ends the loop.
    for line in iter(p.stdout.readline, b''):
      if b"Reading package lists..." in line or b"answer: [UNK] 1848" in line:
        print("\t\t\t\t  Preparing data ......", end='\r')
      if b"INFO:tensorflow:examples/sec" in line:
        throughput = _format_throughput(line, multiplier)
        if throughput is not None:
          print("\t\t\t\t  Training started, current real-time throughput (examples/sec) : " + throughput, end='\r')
  finally:
    p.stdout.close()

  # Reap the child so it does not linger as a zombie, and surface failures.
  returncode = p.wait()
  if returncode != 0:
    print("\nTraining script exited with return code " + str(returncode))

# Kick off the training run when this cell is executed.
run_training()

In [0]:
# Report the installed TensorFlow version, then whether the build is
# Intel-optimized (i.e. what IsMklEnabled() returns).
import tensorflow
print("tensorflow version: " + tensorflow.__version__)

from packaging import version

# The private `_pywrap_util_port` module is imported from a different package
# depending on whether the installed TensorFlow is at least 2.5.0.
if version.parse(tensorflow.__version__) >= version.parse("2.5.0"):
  from tensorflow.python.util import _pywrap_util_port
else:
  from tensorflow.python import _pywrap_util_port

print(_pywrap_util_port.IsMklEnabled())