# Run BERT-Large training workload

In [None]:
%%bash

# Download datasets, checkpoints and pre-trained model.
#
# Abort on the first failing command: without this, a failed `cd` or `wget`
# would let the following steps run in the wrong directory or try to unzip
# an archive that was never downloaded.
set -euo pipefail

bert_root=~/TF/bert-large
squad_dir="${bert_root}/SQuAD-1.1"
squad_url=https://github.com/oap-project/oap-project.github.io/raw/master/resources/ai/bert

# Start from an empty tree so that re-running the cell does not leave
# duplicate downloads behind. `:?` refuses to expand to an empty path.
rm -rf -- "${bert_root:?}"
mkdir -p -- "$squad_dir"

# SQuAD 1.1 dev/train sets and the evaluation script.
cd -- "$squad_dir"
for squad_file in dev-v1.1.json evaluate-v1.1.py train-v1.1.json; do
  wget "${squad_url}/${squad_file}"
done

# Checkpoints archive and the whole-word-masking uncased BERT-Large model;
# both are unpacked side by side in the BERT root directory.
cd -- "$bert_root"
wget https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/bert_large_checkpoints.zip
unzip bert_large_checkpoints.zip

wget https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip
unzip wwm_uncased_L-24_H-1024_A-16.zip

In [None]:
%%bash

# BERT-Large training: install prerequisites and fetch the IntelAI benchmark.
#
# Abort on the first failing command so that a failed install, download or
# `cd` cannot cascade into applying the patch in the wrong directory.
set -euo pipefail

training_dir=~/TF/bert-large/training
patch_name=IntelAI_models_bertlarge_inference_realtime_throughput.patch

# Install necessary packages
sudo apt-get install -y numactl
sudo apt-get install -y libblacs-mpi-dev

# Create ckpt directory. Only the (non-hidden) contents of the training
# directory are cleared; `:?` guards against expanding to "/*".
rm -rf -- "${training_dir:?}"/*
mkdir -p -- "${training_dir}/BERT-Large-output"

# Download IntelAI benchmark (models v1.8.1) together with the patch file
cd -- "$training_dir"
wget https://github.com/IntelAI/models/archive/refs/tags/v1.8.1.zip
unzip v1.8.1.zip
wget "https://github.com/oap-project/oap-tools/raw/master/integrations/ml/databricks/benchmark/${patch_name}"

# The patch is applied from inside the unpacked source tree.
cd ./models-1.8.1/
git apply "../${patch_name}"

In [None]:
%%bash

# BERT-Large training: exported paths and host CPU topology.
export SQUAD_DIR=~/TF/bert-large/SQuAD-1.1/
export BERT_LARGE_OUTPUT=~/TF/bert-large/training/BERT-Large-output
export BERT_LARGE_MODEL=~/TF/bert-large/wwm_uncased_L-24_H-1024_A-16
export PYTHONPATH=~/TF/bert-large/training/models-1.8.1/benchmarks/

# The awk patterns below match lscpu's English field labels, so pin the C
# locale (lscpu output may be localized otherwise). lscpu is queried once
# and both values are extracted from the same snapshot.
cpu_topology=$(LC_ALL=C lscpu)
cores_per_socket=$(awk '/^Core\(s\) per socket/{ print $4 }' <<<"$cpu_topology")
numa_nodes=$(awk '/^NUMA node\(s\)/{ print $3 }' <<<"$cpu_topology")

# launch_benchmark.py is invoked by relative path later in this cell, so
# stop here if the benchmark checkout is missing.
cd ~/TF/bert-large/training/models-1.8.1/benchmarks/ || exit 1

#######################################
# Launch BERT-Large FP32 SQuAD training through launch_benchmark.py without
# any MPI / thread-count options.
# Globals:
#   BERT_LARGE_MODEL (read) - directory holding bert_config.json,
#                             bert_model.ckpt and vocab.txt
#   SQUAD_DIR        (read) - directory holding train-v1.1.json / dev-v1.1.json
#   PYTHON_BIN       (read, optional) - Python interpreter to use; defaults to
#                             the Azure ML TensorFlow environment's interpreter
# Arguments:
#   None
# Returns:
#   Exit status of the launched command.
# Notes:
#   launch_benchmark.py is referenced by relative path, so the caller must
#   already be in the benchmarks directory.
#######################################
run_training_without_numabind() {
  local python_bin="${PYTHON_BIN:-/anaconda/envs/azureml_py38_tensorflow/bin/python}"

  # name=value arguments that follow the bare `--` separator. Kept in an
  # array so that paths containing spaces stay single arguments.
  local -a model_args=(
    train-option=SQuAD
    DEBIAN_FRONTEND=noninteractive
    "config_file=${BERT_LARGE_MODEL}/bert_config.json"
    "init_checkpoint=${BERT_LARGE_MODEL}/bert_model.ckpt"
    "vocab_file=${BERT_LARGE_MODEL}/vocab.txt"
    "train_file=${SQUAD_DIR}/train-v1.1.json"
    "predict_file=${SQUAD_DIR}/dev-v1.1.json"
    do-train=True
    learning-rate=1.5e-5
    max-seq-length=384
    do_predict=True
    warmup-steps=0
    num_train_epochs=0.1
    doc_stride=128
    do_lower_case=False
    experimental-gelu=False
    mpi_workers_sync_gradients=True
  )

  "$python_bin" launch_benchmark.py \
    --model-name=bert_large \
    --precision=fp32 \
    --mode=training \
    --framework=tensorflow \
    --batch-size=4 \
    --benchmark-only \
    --data-location="$BERT_LARGE_MODEL" \
    -- "${model_args[@]}"
}

#######################################
# Launch BERT-Large FP32 SQuAD training through launch_benchmark.py with one
# MPI process per NUMA node and an explicit intra-op thread count.
# Globals:
#   BERT_LARGE_MODEL (read) - directory holding bert_config.json,
#                             bert_model.ckpt and vocab.txt
#   SQUAD_DIR        (read) - directory holding train-v1.1.json / dev-v1.1.json
#   cores_per_socket (read) - cores per socket, as parsed from lscpu
#   numa_nodes       (read) - NUMA node count, as parsed from lscpu
#   PYTHON_BIN       (read, optional) - Python interpreter to use; defaults to
#                             the Azure ML TensorFlow environment's interpreter
# Arguments:
#   None
# Returns:
#   1 if the topology values are missing or not numeric, otherwise the exit
#   status of the launched command.
# Notes:
#   launch_benchmark.py is referenced by relative path, so the caller must
#   already be in the benchmarks directory.
#######################################
run_training_with_numabind() {
  local python_bin="${PYTHON_BIN:-/anaconda/envs/azureml_py38_tensorflow/bin/python}"

  # Fail fast instead of launching with an empty --mpi_num_processes= or a
  # thread count computed from an empty value.
  if ! [[ "${cores_per_socket:-}" =~ ^[0-9]+$ && "${numa_nodes:-}" =~ ^[0-9]+$ ]]; then
    printf 'error: CPU topology not detected (cores_per_socket=%s, numa_nodes=%s)\n' \
      "${cores_per_socket:-}" "${numa_nodes:-}" >&2
    return 1
  fi

  # Two fewer intra-op threads than cores per socket (presumably to leave
  # headroom for other threads on the socket — confirm before tuning).
  local intra_thread=$(( cores_per_socket - 2 ))

  # name=value arguments that follow the bare `--` separator, the same form
  # run_training_without_numabind uses. Kept in an array so that paths
  # containing spaces stay single arguments.
  local -a model_args=(
    train-option=SQuAD
    DEBIAN_FRONTEND=noninteractive
    "config_file=${BERT_LARGE_MODEL}/bert_config.json"
    "init_checkpoint=${BERT_LARGE_MODEL}/bert_model.ckpt"
    "vocab_file=${BERT_LARGE_MODEL}/vocab.txt"
    "train_file=${SQUAD_DIR}/train-v1.1.json"
    "predict_file=${SQUAD_DIR}/dev-v1.1.json"
    do-train=True
    learning-rate=1.5e-5
    max-seq-length=384
    do_predict=True
    warmup-steps=0
    num_train_epochs=0.1
    doc_stride=128
    do_lower_case=False
    experimental-gelu=False
    mpi_workers_sync_gradients=True
  )

  "$python_bin" launch_benchmark.py \
    --model-name=bert_large \
    --precision=fp32 \
    --mode=training \
    --framework=tensorflow \
    --batch-size=4 \
    --mpi_num_processes="$numa_nodes" \
    --num-intra-threads="$intra_thread" \
    --num-inter-threads=1 \
    --benchmark-only \
    --data-location="$BERT_LARGE_MODEL" \
    -- "${model_args[@]}"
}
          

# Choose the launcher from the detected NUMA node count: exactly "1" runs
# the plain variant, any other value (including an empty one) runs the
# MPI / thread-tuned variant.
case "$numa_nodes" in
  1) run_training_without_numabind ;;
  *) run_training_with_numabind ;;
esac

In [None]:
%%bash

# Get the training result.
#
# The benchmark logs are read from the current user's checkout; the path is
# derived from the home directory rather than a fixed /home/azureuser so the
# cell works for any account.
log_dir=~/TF/bert-large/training/models-1.8.1/benchmarks/common/tensorflow/logs
cd -- "$log_dir" || exit 1

# On hosts with more than one NUMA node, each reported figure is a single
# socket's throughput.
# The pattern is matched literally (-F) and is kept verbatim, spelling
# included, because it has to match the text found in the logs. -h drops
# the file-name prefix so the output is the matching lines only.
grep -hF -- "throughput((num_processed_examples-threshod_examples)/Elapsedtime)" benchmark*.log

In [None]:
# Report the installed TensorFlow version and whether the build is
# intel-optimized (MKL enabled).

import tensorflow
print("tensorflow version: " + tensorflow.__version__)

from packaging import version

# The private _pywrap_util_port module is imported from a different package
# depending on the release: tensorflow.python.util for 2.5.0 and later,
# tensorflow.python for anything older. Only the import differs, so the
# check itself is done once below.
if version.parse(tensorflow.__version__) >= version.parse("2.5.0"):
    from tensorflow.python.util import _pywrap_util_port
else:
    from tensorflow.python import _pywrap_util_port

print(_pywrap_util_port.IsMklEnabled())