MLCube packing: Bert benchmark [WIP] #503

Closed · wants to merge 11 commits
1 change: 1 addition & 0 deletions language_model/tensorflow/.gitignore
@@ -0,0 +1 @@
workspace/data/
45 changes: 45 additions & 0 deletions language_model/tensorflow/README.md
@@ -0,0 +1,45 @@
# Bert benchmark

## MLCube execution

### Project setup
```bash
# Create Python environment and install MLCube Docker runner
virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker
```

### Clone the training repo and go to the Bert directory
```bash
git clone https://github.com/mlcommons/training.git && cd ./training
git fetch origin pull/503/head:feature/bert_mlcube && git checkout feature/bert_mlcube
cd ./language_model/tensorflow
```

### Run Bert MLCube on a local machine with the Docker runner

```bash
# Run Bert tasks: download, extract, preprocess, generate_tfrecords and train
mlcube run --task download
mlcube run --task extract
mlcube run --task preprocess
mlcube run --task generate_tfrecords
mlcube run --task train
```

We are targeting pull-type installation, so MLCube images should be available on Docker Hub. If they are not, build them locally by overriding the Docker runner's build strategy:

```bash
# Build the image if it is not already available
mlcube run ... -Pdocker.build_strategy=auto

# Always rebuild the image before running
mlcube run ... -Pdocker.build_strategy=always
```

Users can also override the workspace directory:

```bash
mlcube run --task=download --workspace=absolute_path_to_custom_dir
```
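If it helps, the individual task invocations above can be chained into one loop. This is only a convenience sketch; the workspace path is a placeholder, not something defined by this PR:

```bash
# Convenience sketch: run the full pipeline with a custom workspace.
# /absolute/path/to/workspace is a placeholder.
WORKSPACE=/absolute/path/to/workspace
for task in download extract preprocess generate_tfrecords train; do
    mlcube run --task=$task --workspace=$WORKSPACE
done
```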
16 changes: 16 additions & 0 deletions language_model/tensorflow/bert/Dockerfile
@@ -0,0 +1,16 @@
#FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
FROM tensorflow/tensorflow:1.15.2-gpu

RUN apt-get update && apt-get install -y --no-install-recommends time \
ca-certificates \
build-essential \
git \
bzip2

COPY requirements.txt /requirements.txt
RUN pip install --no-cache-dir -r /requirements.txt

COPY . /workspace
WORKDIR /workspace
ENTRYPOINT ["python", "mlcube.py"]
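Since the entrypoint is `python mlcube.py`, the MLCube Docker runner only needs to append a task name and its options when it starts the container. As a rough illustration (the image tag and host path below are assumptions, not defined by this PR):

```bash
# Illustrative manual invocation of the `download` task.
# The image tag and the host data path are placeholders.
docker run --rm \
    -v /absolute/path/to/workspace/data:/data \
    mlcommons/bert_mlcube:latest download --data_dir=/data
```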
language_model/tensorflow/bert/cleanup_scripts/download_and_uncompress.sh
@@ -2,9 +2,10 @@

pip install --user gdown

mkdir -p wiki
data_dir=${DATA_DIR:-./}
mkdir -p $data_dir/wiki

cd wiki
cd $data_dir/wiki

# Downloading files from Google Drive location: https://drive.google.com/drive/folders/1oQF4diVHNPCclykwdvQJw8n_VIWwV0PT

@@ -23,6 +24,8 @@ gdown https://drive.google.com/uc?id=14_A6gQ0NJ7Pay1X0xFq9rCKUuFJcKLF-
# enwiki-20200101-pages-articles-multistream.xml.bz2
gdown https://drive.google.com/uc?id=18K1rrNJ_0lSR9bsLaoP3PkQeSFO-9LE7

echo uncompressing enwiki-20200101-pages-articles-multistream.xml.bz2
echo this may take a while...
bzip2 -d enwiki-20200101-pages-articles-multistream.xml.bz2

# Download TF-1 checkpoints
@@ -51,6 +54,3 @@ gdown https://drive.google.com/uc?id=1oVBgtSxkXC9rH2SXJv85RXR9-WrMPy-Q

# Back to bert/cleanup_scripts
cd ../..



language_model/tensorflow/bert/cleanup_scripts/generate_tfrecords.sh
@@ -0,0 +1,44 @@
#!/bin/bash

data_dir=${DATA_DIR:-./}
wiki_dir=$data_dir/wiki/
results_dir=$data_dir/results/
tfrecord_dir=$data_dir/tfrecord/

mkdir -p $tfrecord_dir

echo "Processing train data"
# Generate one TFRecord for each results_dir/part-00XXX-of-00500 file.
for file in $results_dir/*
do
if [[ $file == *"part"* ]]; then
echo "Processing file: $file"
python create_pretraining_data.py \
--input_file=$file \
--output_file=$tfrecord_dir/${file##*/} \
--vocab_file=$wiki_dir/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=76 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=10
fi
done

echo "Processing eval data"
python create_pretraining_data.py \
--input_file=$results_dir/eval.txt \
--output_file=$tfrecord_dir/eval_intermediate \
--vocab_file=$wiki_dir/vocab.txt \
--do_lower_case=True \
--max_seq_length=512 \
--max_predictions_per_seq=76 \
--masked_lm_prob=0.15 \
--random_seed=12345 \
--dupe_factor=10

python3 pick_eval_samples.py \
--input_tfrecord=$tfrecord_dir/eval_intermediate \
--output_tfrecord=$tfrecord_dir/eval_10k \
--num_examples_to_pick=10000
18 changes: 14 additions & 4 deletions language_model/tensorflow/bert/cleanup_scripts/process_wiki.sh
@@ -5,29 +5,39 @@
# example: ./process_wiki.sh 'sample_data/wiki_??'
# The resulting files will be placed in ./results

inputs=$1
data_dir=${DATA_DIR:-./}
text_dir=$data_dir/text/
inputs="${text_dir}*/wiki_??"

ls $inputs

pip install nltk==3.4.5

# Remove doc tag and title
echo "RUNNING SCRIPT #1: cleanup_file.py"
python ./cleanup_file.py --data=$inputs --output_suffix='.1'

# Further clean up files
echo "RUNNING SCRIPT #2: clean.sh"
for f in ${inputs}; do
./clean.sh ${f}.1 ${f}.2
done

# Sentence segmentation
echo "RUNNING SCRIPT #3: do_sentence_segmentation.py"
python ./do_sentence_segmentation.py --data=$inputs --input_suffix='.2' --output_suffix='.3'

mkdir -p ./results
result_dir=$data_dir/results
mkdir -p $result_dir

# Train/eval separation
python ./seperate_test_set.py --data=$inputs --input_suffix='.3' --output_suffix='.4' --num_test_articles=10000 --test_output='./results/eval'
echo "RUNNING SCRIPT #4: seperate_test_set.py"
python ./seperate_test_set.py --data=$inputs --input_suffix='.3' --output_suffix='.4' --num_test_articles=10000 --test_output="${result_dir}/eval"

## Choose file size method or number of packages by uncommenting only one of the following do_gather options
# Gather into fixed size packages
python ./do_gather.py --data=$inputs --input_suffix='.4' --block_size=26.92 --out_dir='./results'
echo "RUNNING SCRIPT #5: do_gather.py"
python ./do_gather.py --data=$inputs --input_suffix='.4' --block_size=26.92 --out_dir=$result_dir

# Gather into fixed number of packages
#NUM_PACKAGES=512
language_model/tensorflow/bert/cleanup_scripts/run_wiki_extractor.sh
@@ -0,0 +1,15 @@
#!/bin/bash

git clone https://github.com/attardi/wikiextractor.git

cd wikiextractor

git checkout 3162bb6c3c9ebd2d15be507aa11d6fa818a454ac

# Back to <bert>/cleanup_scripts
cd ..

# Run `WikiExtractor.py` to extract data from XML.
data_dir=${DATA_DIR:-./}
wiki_dir=$data_dir/wiki
python wikiextractor/WikiExtractor.py $wiki_dir/enwiki-20200101-pages-articles-multistream.xml -o $data_dir/text
128 changes: 128 additions & 0 deletions language_model/tensorflow/bert/mlcube.py
@@ -0,0 +1,128 @@
"""MLCube handler file"""
import os
import shutil
import subprocess
from pathlib import Path

import typer
import yaml

app = typer.Typer()


class DownloadTask(object):
"""Download task Class
It defines the environment variables:
DATA_ROOT_DIR: Directory path to download the dataset
Then executes the download script"""
@staticmethod
def run(data_dir: str) -> None:

env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
})

process = subprocess.Popen(
"./cleanup_scripts/download_and_uncompress.sh", cwd=".", env=env)
process.wait()


class ExtractTask(object):
"""Extract task Class
It defines the environment variables:
DATA_ROOT_DIR: Directory path to download the dataset
Then executes the download script"""
@staticmethod
def run(data_dir: str) -> None:

env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
})

process = subprocess.Popen(
"./cleanup_scripts/run_wiki_extractor.sh", cwd=".", env=env)
process.wait()


class PreprocessTask(object):
"""Preprocess task Class
It defines the environment variables:
DATA_ROOT_DIR: Directory path to download the dataset
Then executes the download script"""
@staticmethod
def run(data_dir: str) -> None:

env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
})
process = subprocess.Popen(
"./process_wiki.sh", cwd="./cleanup_scripts", env=env)
process.wait()


class GenerateTfrecordsTask(object):
"""Preprocess task Class
It defines the environment variables:
DATA_ROOT_DIR: Directory path to download the dataset
Then executes the download script"""
@staticmethod
def run(data_dir: str) -> None:

env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
})
process = subprocess.Popen(
"./generate_tfrecords.sh", cwd="./cleanup_scripts", env=env)
process.wait()


class TrainTask(object):
"""Preprocess dataset task Class
It defines the environment variables:
DATA_DIR: Dataset directory path
All other parameters are defined in the parameters_file
Then executes the benchmark script"""
@staticmethod
def run(data_dir: str, output_dir: str) -> None:
env = os.environ.copy()
env.update({
'DATA_DIR': data_dir,
'OUTPUT_DIR': output_dir
})
process = subprocess.Popen(
"./run_and_time.sh", cwd=".", env=env)
process.wait()


@app.command("download")
def download(data_dir: str = typer.Option(..., '--data_dir')):
DownloadTask.run(data_dir)


@app.command("extract")
def extract(data_dir: str = typer.Option(..., '--data_dir')):
ExtractTask.run(data_dir)


@app.command("preprocess")
def preprocess(data_dir: str = typer.Option(..., '--data_dir')):
PreprocessTask.run(data_dir)


@app.command("generate_tfrecords")
def generate_tfrecords(data_dir: str = typer.Option(..., '--data_dir')):
GenerateTfrecordsTask.run(data_dir)


@app.command("train")
def train(data_dir: str = typer.Option(..., '--data_dir'),
output_dir: str = typer.Option(..., '--output_dir')):
TrainTask.run(data_dir, output_dir)


if __name__ == '__main__':
app()
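The Typer app above exposes one sub-command per MLCube task, so the handler can also be exercised directly outside of Docker. A minimal sketch with placeholder paths:

```bash
# Call the handler directly (outside the container); paths are placeholders.
python mlcube.py download --data_dir=/absolute/path/to/workspace/data
python mlcube.py train --data_dir=/absolute/path/to/workspace/data \
                       --output_dir=/absolute/path/to/output
```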
5 changes: 5 additions & 0 deletions language_model/tensorflow/bert/requirements.txt
@@ -0,0 +1,5 @@
PyYAML==5.4.1
typer==0.3.2
gdown==3.3.1
wheel==0.37.0
git+https://github.com/mlperf/logging.git@9aa718d525d1e8e64d32b12fe1b22133973d7063
30 changes: 30 additions & 0 deletions language_model/tensorflow/bert/run_and_time.sh
@@ -0,0 +1,30 @@
#!/bin/bash

data_dir=${DATA_DIR:-./cleanup_scripts}
output_dir=${OUTPUT_DIR:-/tmp/output/}
wiki_dir=$data_dir/wiki/
results_dir=$data_dir/results/
tfrecord_dir=$data_dir/tfrecord/

TF_XLA_FLAGS='--tf_xla_auto_jit=2' \
time python3 run_pretraining.py \
--bert_config_file=$wiki_dir/bert_config.json \
--output_dir=$output_dir \
--input_file="${tfrecord_dir}/part*" \
--do_train \
--do_eval \
--eval_batch_size=8 \
--init_checkpoint=./checkpoint/model.ckpt-28252 \
--iterations_per_loop=1000 \
--learning_rate=0.0001 \
--max_eval_steps=1250 \
--max_predictions_per_seq=76 \
--max_seq_length=512 \
--num_gpus=1 \
--num_train_steps=107538 \
--num_warmup_steps=1562 \
--optimizer=lamb \
--save_checkpoints_steps=1562 \
--start_warmup_step=0 \
--train_batch_size=24 \
--nouse_tpu
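The wrapper takes its locations from the `DATA_DIR` and `OUTPUT_DIR` environment variables, which is how `TrainTask` in `mlcube.py` drives it. Running it by hand looks roughly like this (both paths are placeholders):

```bash
# Manual invocation of the training wrapper; both paths are placeholders.
DATA_DIR=/absolute/path/to/workspace/data \
OUTPUT_DIR=/tmp/bert_output \
./run_and_time.sh
```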
2 changes: 1 addition & 1 deletion language_model/tensorflow/bert/run_pretraining.py
@@ -537,7 +537,7 @@ def main(_):

# Creates session config. allow_soft_placement = True, is required for
# multi-GPU and is not harmful for other modes.
session_config = tf.compat.v1.ConfigProto(
session_config = tf.ConfigProto(
inter_op_parallelism_threads=8,
allow_soft_placement=True)
