MLCube packing: Bert benchmark [WIP] #503

Closed · wants to merge 11 commits
1 change: 1 addition & 0 deletions language_model/tensorflow/.gitignore
@@ -0,0 +1 @@
workspace/data/
45 changes: 45 additions & 0 deletions language_model/tensorflow/README.md
@@ -0,0 +1,45 @@
# Bert benchmark

## MLCube execution

### Project setup
```bash
# Create Python environment and install MLCube Docker runner
virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker
```

### Clone the training repo and go to the Bert directory
```bash
git clone https://github.com/mlcommons/training.git && cd ./training
git fetch origin pull/503/head:feature/bert_mlcube && git checkout feature/bert_mlcube
cd ./language_model/tensorflow
```

### Run Bert MLCube on a local machine with the Docker runner

```bash
# Run Bert tasks: download, extract, preprocess, generate_tfrecords and train
mlcube run --task download
mlcube run --task extract
mlcube run --task preprocess
mlcube run --task generate_tfrecords
mlcube run --task train
```

We are targeting pull-type installation, so MLCube images should be available on Docker Hub. If they are not, build them locally by overriding the Docker runner's build strategy:

```bash
# Build the image if it is not already available
mlcube run ... -Pdocker.build_strategy=auto

# Always rebuild the image before running
mlcube run ... -Pdocker.build_strategy=always
```

Users can also override the workspace directory:

```bash
mlcube run --task=download --workspace=absolute_path_to_custom_dir
```
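If it helps, the individual task invocations above can be chained into one loop. This is only a convenience sketch; the workspace path is a placeholder, not something defined by this PR:

```bash
# Convenience sketch: run the full pipeline with a custom workspace.
# /absolute/path/to/workspace is a placeholder.
WORKSPACE=/absolute/path/to/workspace
for task in download extract preprocess generate_tfrecords train; do
    mlcube run --task=$task --workspace=$WORKSPACE
done
```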
16 changes: 16 additions & 0 deletions language_model/tensorflow/bert/Dockerfile
@@ -0,0 +1,16 @@
#FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
FROM tensorflow/tensorflow:1.15.2-gpu

RUN apt-get update && apt-get install -y --no-install-recommends time \
ca-certificates \
build-essential \
git \
bzip2

COPY requirements.txt /requirements.txt
RUN pip install --no-cache-dir -r /requirements.txt

COPY . /workspace
WORKDIR /workspace
ENTRYPOINT ["python", "mlcube.py"]
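Since the entrypoint is `python mlcube.py`, the MLCube Docker runner only needs to append a task name and its options when it starts the container. As a rough illustration (the image tag and host path below are assumptions, not defined by this PR):

```bash
# Illustrative manual invocation of the `download` task.
# The image tag and the host data path are placeholders.
docker run --rm \
    -v /absolute/path/to/workspace/data:/data \
    mlcommons/bert_mlcube:latest download --data_dir=/data
```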
language_model/tensorflow/bert/cleanup_scripts/download_and_uncompress.sh
@@ -2,9 +2,10 @@

pip install --user gdown

mkdir -p wiki
data_dir=${DATA_DIR:-./}
mkdir -p $data_dir/wiki

cd wiki
cd $data_dir/wiki

# Downloading files from Google Drive location: https://drive.google.com/drive/folders/1oQF4diVHNPCclykwdvQJw8n_VIWwV0PT

@@ -23,6 +24,8 @@ gdown https://drive.google.com/uc?id=14_A6gQ0NJ7Pay1X0xFq9rCKUuFJcKLF-
# enwiki-20200101-pages-articles-multistream.xml.bz2
gdown https://drive.google.com/uc?id=18K1rrNJ_0lSR9bsLaoP3PkQeSFO-9LE7

echo uncompressing enwiki-20200101-pages-articles-multistream.xml.bz2
echo this may take a while...
bzip2 -d enwiki-20200101-pages-articles-multistream.xml.bz2

# Download TF-1 checkpoints
@@ -51,6 +54,3 @@ gdown https://drive.google.com/uc?id=1oVBgtSxkXC9rH2SXJv85RXR9-WrMPy-Q

# Back to bert/cleanup_scripts
cd ../..



language_model/tensorflow/bert/cleanup_scripts/generate_tfrecords.sh
@@ -0,0 +1,44 @@
#!/bin/bash

data_dir=${DATA_DIR:-./}
wiki_dir=$data_dir/wiki/
results_dir=$data_dir/results/
tfrecord_dir=$data_dir/tfrecord/

mkdir -p $tfrecord_dir

echo "Processing train data"
# Generate one TFRecord for each results_dir/part-00XXX-of-00500 file.
for file in $results_dir/*
do
if [[ $file == *"part"* ]]; then
echo "Processing file: $file"
python create_pretraining_data.py \
--input_file=$file \
--output_file=$tfrecord_dir/${file##*/} \
--vocab_file=$wiki_dir/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=76 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=10
fi
done

echo "Processing eval data"
python create_pretraining_data.py \
--input_file=$results_dir/eval.txt \
--output_file=$tfrecord_dir/eval_intermediate \
--vocab_file=$wiki_dir/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=76 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=10

python3 pick_eval_samples.py \
--input_tfrecord=$tfrecord_dir/eval_intermediate \
--output_tfrecord=$tfrecord_dir/eval_10k \
--num_examples_to_pick=10000
18 changes: 14 additions & 4 deletions language_model/tensorflow/bert/cleanup_scripts/process_wiki.sh
@@ -5,29 +5,39 @@
# example: ./process_wiki.sh 'sample_data/wiki_??'
# The resulting files will be placed in ./results

inputs=$1
data_dir=${DATA_DIR:-./}
text_dir=$data_dir/text/
inputs="${text_dir}*/wiki_??"

ls $inputs

pip install nltk==3.4.5

# Remove doc tag and title
echo "RUNNING SCRIPT #1: cleanup_file.py"
python ./cleanup_file.py --data=$inputs --output_suffix='.1'

# Further clean up files
echo "RUNNING SCRIPT #2: clean.sh"
for f in ${inputs}; do
./clean.sh ${f}.1 ${f}.2
done

# Sentence segmentation
echo "RUNNING SCRIPT #3: do_sentence_segmentation.py"
python ./do_sentence_segmentation.py --data=$inputs --input_suffix='.2' --output_suffix='.3'

mkdir -p ./results
result_dir=$data_dir/results
mkdir -p $result_dir

# Train/eval separation
python ./seperate_test_set.py --data=$inputs --input_suffix='.3' --output_suffix='.4' --num_test_articles=10000 --test_output='./results/eval'
echo "RUNNING SCRIPT #4: seperate_test_set.py"
python ./seperate_test_set.py --data=$inputs --input_suffix='.3' --output_suffix='.4' --num_test_articles=10000 --test_output="${result_dir}/eval"

## Choose file size method or number of packages by uncommenting only one of the following do_gather options
# Gather into fixed size packages
python ./do_gather.py --data=$inputs --input_suffix='.4' --block_size=26.92 --out_dir='./results'
echo "RUNNING SCRIPT #5: do_gather.py"
python ./do_gather.py --data=$inputs --input_suffix='.4' --block_size=26.92 --out_dir=$result_dir

# Gather into fixed number of packages
#NUM_PACKAGES=512
language_model/tensorflow/bert/cleanup_scripts/run_wiki_extractor.sh
@@ -0,0 +1,15 @@
#!/bin/bash

git clone https://github.com/attardi/wikiextractor.git

cd wikiextractor

git checkout 3162bb6c3c9ebd2d15be507aa11d6fa818a454ac

# Back to <bert>/cleanup_scripts
cd ..

# Run `WikiExtractor.py` to extract data from XML.
data_dir=${DATA_DIR:-./}
wiki_dir=$data_dir/wiki
python wikiextractor/WikiExtractor.py $wiki_dir/enwiki-20200101-pages-articles-multistream.xml -o $data_dir/text
128 changes: 128 additions & 0 deletions language_model/tensorflow/bert/mlcube.py
@@ -0,0 +1,128 @@
"""MLCube handler file"""
import os
import shutil
import subprocess
from pathlib import Path

import typer
import yaml

app = typer.Typer()


class DownloadTask(object):
"""Download task Class
It defines the environment variables:
DATA_ROOT_DIR: Directory path to download the dataset
Then executes the download script"""
@staticmethod
def run(data_dir: str) -> None:

env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
})

process = subprocess.Popen(
"./cleanup_scripts/download_and_uncompress.sh", cwd=".", env=env)
process.wait()


class ExtractTask(object):
"""Extract task Class
It defines the environment variables:
DATA_ROOT_DIR: Directory path to download the dataset
Then executes the download script"""
@staticmethod
def run(data_dir: str) -> None:

env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
})

process = subprocess.Popen(
"./cleanup_scripts/run_wiki_extractor.sh", cwd=".", env=env)
process.wait()


class PreprocessTask(object):
"""Preprocess task Class
It defines the environment variables:
DATA_ROOT_DIR: Directory path to download the dataset
Then executes the download script"""
@staticmethod
def run(data_dir: str) -> None:

env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
})
process = subprocess.Popen(
"./process_wiki.sh", cwd="./cleanup_scripts", env=env)
process.wait()


class GenerateTfrecordsTask(object):
"""Preprocess task Class
It defines the environment variables:
DATA_ROOT_DIR: Directory path to download the dataset
Then executes the download script"""
@staticmethod
def run(data_dir: str) -> None:

env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
})
process = subprocess.Popen(
"./generate_tfrecords.sh", cwd="./cleanup_scripts", env=env)
process.wait()


class TrainTask(object):
"""Preprocess dataset task Class
It defines the environment variables:
DATA_DIR: Dataset directory path
All other parameters are defined in the parameters_file
Then executes the benchmark script"""
@staticmethod
def run(data_dir: str, output_dir: str) -> None:
env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
'OUTPUT_DIR': output_dir
})
process = subprocess.Popen(
"./run_and_time.sh", cwd=".", env=env)
process.wait()


@app.command("download")
def download(data_dir: str = typer.Option(..., '--data_dir')):
DownloadTask.run(data_dir)


@app.command("extract")
def extract(data_dir: str = typer.Option(..., '--data_dir')):
ExtractTask.run(data_dir)


@app.command("preprocess")
def preprocess(data_dir: str = typer.Option(..., '--data_dir')):
PreprocessTask.run(data_dir)


@app.command("generate_tfrecords")
def generate_tfrecords(data_dir: str = typer.Option(..., '--data_dir')):
GenerateTfrecordsTask.run(data_dir)


@app.command("train")
def train(data_dir: str = typer.Option(..., '--data_dir'),
output_dir: str = typer.Option(..., '--output_dir')):
TrainTask.run(data_dir, output_dir)


if __name__ == '__main__':
app()
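The Typer app above exposes one sub-command per MLCube task, so the handler can also be exercised directly outside of Docker. A minimal sketch with placeholder paths:

```bash
# Call the handler directly (outside the container); paths are placeholders.
python mlcube.py download --data_dir=/absolute/path/to/workspace/data
python mlcube.py train --data_dir=/absolute/path/to/workspace/data \
                       --output_dir=/absolute/path/to/output
```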
5 changes: 5 additions & 0 deletions language_model/tensorflow/bert/requirements.txt
@@ -0,0 +1,5 @@
PyYAML==5.4.1
typer==0.3.2
gdown==3.3.1
wheel==0.37.0
git+https://github.com/mlperf/logging.git@9aa718d525d1e8e64d32b12fe1b22133973d7063
30 changes: 30 additions & 0 deletions language_model/tensorflow/bert/run_and_time.sh
@@ -0,0 +1,30 @@
#!/bin/bash

data_dir=${DATA_DIR:-./cleanup_scripts}
output_dir=${OUTPUT_DIR:-/tmp/output/}
wiki_dir=$data_dir/wiki/
results_dir=$data_dir/results/
tfrecord_dir=$data_dir/tfrecord/

TF_XLA_FLAGS='--tf_xla_auto_jit=2' \
time python3 run_pretraining.py \
--bert_config_file=$wiki_dir/bert_config.json \
--output_dir=$output_dir \
--input_file="${tfrecord_dir}/part*" \
--do_train \
--do_eval \
--eval_batch_size=8 \
--init_checkpoint=./checkpoint/model.ckpt-28252 \
--iterations_per_loop=1000 \
--learning_rate=0.0001 \
--max_eval_steps=1250 \
--max_predictions_per_seq=76 \
--max_seq_length=512 \
--num_gpus=1 \
--num_train_steps=107538 \
--num_warmup_steps=1562 \
--optimizer=lamb \
--save_checkpoints_steps=1562 \
--start_warmup_step=0 \
--train_batch_size=24 \
--nouse_tpu
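The wrapper takes its locations from the `DATA_DIR` and `OUTPUT_DIR` environment variables, which is how `TrainTask` in `mlcube.py` drives it. Running it by hand looks roughly like this (both paths are placeholders):

```bash
# Manual invocation of the training wrapper; both paths are placeholders.
DATA_DIR=/absolute/path/to/workspace/data \
OUTPUT_DIR=/tmp/bert_output \
./run_and_time.sh
```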
2 changes: 1 addition & 1 deletion language_model/tensorflow/bert/run_pretraining.py
@@ -537,7 +537,7 @@ def main(_):

# Creates session config. allow_soft_placement = True, is required for
# multi-GPU and is not harmful for other modes.
session_config = tf.compat.v1.ConfigProto(
session_config = tf.ConfigProto(
inter_op_parallelism_threads=8,
allow_soft_placement=True)
