## Run BERT-Large inference workload

Data preprocessing takes a few minutes at the start. Once preprocessing is done, the inference workload reports its throughput in real time. The entire inference process takes around 30 minutes on a Standard_F32s_v2 instance; the precise elapsed time depends on the instance type and on whether Intel-optimized TensorFlow is used.

**Note:** ***If you click "Stop Execution" while a training/inference cell is running and then immediately run training/inference again, you may see lower performance numbers because the previous training/inference run is still in progress.***

In [0]:
import os
import subprocess

from pathlib import Path
  
# Output fragments seen while packages are installed and the input data is prepared.
_PREPARING_MARKERS = (
  b'Reading package lists...',
  b'INFO:tensorflow:tokens',
  b'INFO:tensorflow:  name = bert',
)
# Fragment of the periodic real-time throughput line.
_REALTIME_MARKER = b"INFO:tensorflow:examples/sec"
# Fragment of the final summary line (spelling must match the benchmark output).
_OVERALL_MARKER = b"throughput((num_processed_examples-threshod_examples)/Elapsedtime)"


def _write_inference_script(path):
  """Write the bash script that sets up and launches the BERT-Large benchmark to `path`."""
  with open(path, 'w') as f:
    # NOTE: this is a normal (non-raw) string, so a backslash at the end of a
    # line joins it with the next one before the script is written, and the
    # awk pattern needs doubled backslashes to emit a literal `\(s\)`.
    f.write("""#!/bin/bash
    # BERT-Large Inference
    # Install necessary package
    sudo apt-get update
    sudo apt-get install zip -y
    sudo apt-get -y install git
    sudo apt-get install -y numactl
    # Remove old materials if exist
    rm -rf /TF/
    mkdir /TF/
    # Create ckpt directory
    mkdir -p /TF/BERT-Large-output/
    export BERT_LARGE_OUTPUT=/TF/BERT-Large-output
    # Download IntelAI benchmark
    cd /TF/
    wget https://github.com/IntelAI/models/archive/refs/tags/v1.8.1.zip
    unzip v1.8.1.zip
    cd /TF/models-1.8.1/
    wget https://github.com/oap-project/oap-tools/raw/master/integrations/ml/databricks/benchmark/IntelAI_models_bertlarge_inference_realtime_throughput.patch
    git apply IntelAI_models_bertlarge_inference_realtime_throughput.patch

    export SQUAD_DIR=/dbfs/home/TF/bert-large/SQuAD-1.1/
    export BERT_LARGE_DIR=/dbfs/home/TF/bert-large/
    export PYTHONPATH=$PYTHONPATH:.

    # Launch Benchmark for inference
    numa_nodes=$(lscpu | awk '/^NUMA node\\(s\\)/{ print $3 }')

    function run_inference_without_numabind() {
      cd /TF/models-1.8.1/benchmarks/
      python3 launch_benchmark.py \
        --model-name=bert_large \
        --precision=fp32 \
        --mode=inference \
        --framework=tensorflow \
        --batch-size=32 \
        --data-location $BERT_LARGE_DIR/wwm_uncased_L-24_H-1024_A-16 \
        --checkpoint $BERT_LARGE_DIR/bert_large_checkpoints \
        --output-dir $BERT_LARGE_OUTPUT/bert-squad-output \
        --verbose \
        -- infer_option=SQuAD \
           DEBIAN_FRONTEND=noninteractive \
           predict_file=$SQUAD_DIR/dev-v1.1.json \
           experimental-gelu=False \
           init_checkpoint=model.ckpt-3649
    }

    function run_inference_with_numabind() {
      cd /TF/models-1.8.1/benchmarks/
      nohup python3 launch_benchmark.py \
        --model-name=bert_large \
        --precision=fp32 \
        --mode=inference \
        --framework=tensorflow \
        --batch-size=32 \
        --socket-id 0  \
        --data-location $BERT_LARGE_DIR/wwm_uncased_L-24_H-1024_A-16 \
        --checkpoint $BERT_LARGE_DIR/bert_large_checkpoints \
        --output-dir $BERT_LARGE_OUTPUT/bert-squad-output \
        --verbose \
        -- infer_option=SQuAD \
           DEBIAN_FRONTEND=noninteractive \
           predict_file=$SQUAD_DIR/dev-v1.1.json \
           experimental-gelu=False \
           init_checkpoint=model.ckpt-3649 >> socket0-inference-log &

       python3 launch_benchmark.py \
        --model-name=bert_large \
        --precision=fp32 \
        --mode=inference \
        --framework=tensorflow \
        --batch-size=32 \
        --socket-id 1 \
        --data-location $BERT_LARGE_DIR/wwm_uncased_L-24_H-1024_A-16 \
        --checkpoint $BERT_LARGE_DIR/bert_large_checkpoints \
        --output-dir $BERT_LARGE_OUTPUT/bert-squad-output \
        --verbose \
        -- infer_option=SQuAD \
           DEBIAN_FRONTEND=noninteractive \
           predict_file=$SQUAD_DIR/dev-v1.1.json \
           experimental-gelu=False \
           init_checkpoint=model.ckpt-3649
    }

    if [ "$numa_nodes" = "1" ];then
            run_inference_without_numabind
    else
            run_inference_with_numabind
    fi""")


def _throughput_text(line, separator, scale):
  """Return the display text for the throughput figure found in `line`.

  The figure is the second field of the decoded line when split on
  `separator`. With `scale == 1` the field is returned verbatim; otherwise it
  is parsed as a float and multiplied by `scale`. Returns None when the line
  has no second field or the field is not a number that can be scaled.
  """
  fields = line.decode(errors='replace').strip().split(separator)
  if len(fields) < 2:
    return None
  if scale == 1:
    return fields[1]
  try:
    return str(float(fields[1]) * scale)
  except ValueError:
    # An unexpected line format should not abort progress reporting.
    return None


def _report_progress(line, scale):
  """Print a status message if `line` (bytes) is one of the recognised output lines."""
  if any(marker in line for marker in _PREPARING_MARKERS):
    print("\t\t\t\t  Preparing data ......", end='\r')
  if _REALTIME_MARKER in line:
    rate = _throughput_text(line, ' ', scale)
    if rate is not None:
      print("\t\t\t\t  Inference started, current real-time throughput (examples/sec) : " + rate, end='\r')
  if _OVERALL_MARKER in line:
    rate = _throughput_text(line, ':', scale)
    if rate is not None:
      print("\t\t\t\t  Inference finished, overall inference throughput (examples/sec) : " + rate, end='\r')


def run_inference():
  """Write the benchmark script to /tmp/inference.sh, run it, and report progress.

  The script's stdout is scanned line by line and a status message is printed
  for data preparation, real-time throughput and final throughput lines.

  When /sys/devices/system/node/node1 exists, the reported throughput is
  doubled: the script then launches one benchmark per socket and only the
  socket-1 instance writes to stdout (socket 0 is appended to
  socket0-inference-log), so doubling presumably approximates the combined
  figure.

  Returns None once the script has exited.
  """
  inference = '/tmp/inference.sh'
  _write_inference_script(inference)
  # File modes are octal: decimal 555 would be 0o1053. The owner keeps write
  # permission so the script can be regenerated when this cell is re-run.
  os.chmod(inference, 0o755)

  scale = 2 if Path("/sys/devices/system/node/node1").exists() else 1

  # stderr is never consumed here, so it is discarded instead of piped: an
  # unread pipe can fill up and block the child process.
  with subprocess.Popen([inference], stdin=None, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) as proc:
    # readline() returns b'' at EOF; leaving the `with` block closes stdout
    # and waits for the process to exit.
    for line in iter(proc.stdout.readline, b''):
      _report_progress(line, scale)
  
# Start the benchmark when this cell is executed; the call returns only after the inference script has exited.
run_inference()

## Check whether TensorFlow is Intel-optimized

This is a simple auxiliary script that checks whether the installed TensorFlow is Intel-optimized TensorFlow. "True" means it is Intel-optimized TensorFlow.

In [0]:
# Show the installed TensorFlow version, then report whether MKL is enabled
# (i.e. whether this is an Intel-optimized build).
import tensorflow
print("tensorflow version: " + tensorflow.__version__)

from packaging import version

# _pywrap_util_port is imported from tensorflow.python.util when the installed
# version is at least 2.5.0, and from tensorflow.python otherwise.
installed_version = version.parse(tensorflow.__version__)
if installed_version >= version.parse("2.5.0"):
  from tensorflow.python.util import _pywrap_util_port
else:
  from tensorflow.python import _pywrap_util_port

print(_pywrap_util_port.IsMklEnabled())