From cc0b930612f9fb99213da6da945881e243566b8d Mon Sep 17 00:00:00 2001
From: Luiz Scheinkman
Date: Fri, 28 May 2021 10:14:28 -0700
Subject: [PATCH 1/2] Add deepspeed version of LR range test experiment

---
 .../transformers/experiments/ablations.py | 40 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/projects/transformers/experiments/ablations.py b/projects/transformers/experiments/ablations.py
index a851df7a4..a685eac64 100644
--- a/projects/transformers/experiments/ablations.py
+++ b/projects/transformers/experiments/ablations.py
@@ -18,7 +18,6 @@
 #
 # http://numenta.org/licenses/
 # ----------------------------------------------------------------------
-
 from copy import deepcopy
 
 from ray import tune
@@ -291,6 +290,42 @@ def pct_start_hp_space(trial):
     model_name_or_path="/mnt/efs/results/pretrained-models/transformers-local/bert_sparse_80%_kd_onecycle_lr_100k",  # noqa: E501
 )
 
+#############
+# Deepspeed #
+#############
+
+# This lr-range test is based on `bert_sparse_100k_kd_lr_range_test` and adapted
+# for deepspeed training.
+# With this test, the best `max_lr` value found is `0.0017`.
+# On four p3.16xlarge instances it takes ~20 minutes to run.
+bert_sparse_100k_kd_lr_range_test_deepspeed = deepcopy(bert_sparse_100k_kd_lr_range_test)  # noqa: E501
+bert_sparse_100k_kd_lr_range_test_deepspeed.update(
+    max_steps=100,
+    tokenized_data_cache_dir="/mnt/datasets/huggingface/preprocessed-datasets/text",
+    fp16=False,  # Use deepspeed FP16 instead of apex
+    deepspeed={
+        "zero_optimization": {
+            "stage": 1,
+        },
+        # When using fp16 dynamic loss scale, deepspeed will skip the optimizer
+        # and LR scheduler steps whenever the loss value overflows (NaN/Inf).
+        # Using deepspeed default values, the loss will likely overflow on the
+        # first few steps as the dynamic loss scale warms up. When the loss
+        # overflows, huggingface will detect the LR scheduler step was skipped
+        # and return zero as the current learning rate, potentially affecting
+        # the results of the LR range test. To avoid loss overflow during the
+        # LR range test, you could use a static loss scale or a smaller initial
+        # scale power.
+        # See https://www.deepspeed.ai/docs/config-json/#fp16-training-options
+        "fp16": {
+            "enabled": True,
+            "initial_scale_power": 14,
+        },
+        "gradient_clipping": 1.0,
+        "sparse_gradients": True,
+        "steps_per_print": 1,
+    }
+)
 
 CONFIGS = dict(
     # Tiny BERT
     bert_sparse_100k_kd_oncycle_lr=bert_sparse_100k_kd_oncycle_lr,
     bert_sparse_100k_kd_lr_range_test=bert_sparse_100k_kd_lr_range_test,
     finetuning_bert_sparse_kd_oncycle_lr_100k_glue=finetuning_bert_sparse_kd_oncycle_lr_100k_glue,  # noqa: E501
+
+    # Deepspeed
+    bert_sparse_100k_kd_lr_range_test_deepspeed=bert_sparse_100k_kd_lr_range_test_deepspeed,  # noqa: E501
 )

From a0c3689d76aee8930a4a7520a7ada929c59c4292 Mon Sep 17 00:00:00 2001
From: Luiz Scheinkman
Date: Sat, 29 May 2021 08:40:06 -0700
Subject: [PATCH 2/2] Fix comment

---
 projects/transformers/experiments/ablations.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/projects/transformers/experiments/ablations.py b/projects/transformers/experiments/ablations.py
index a685eac64..ad2dfb713 100644
--- a/projects/transformers/experiments/ablations.py
+++ b/projects/transformers/experiments/ablations.py
@@ -290,9 +290,9 @@ def pct_start_hp_space(trial):
     model_name_or_path="/mnt/efs/results/pretrained-models/transformers-local/bert_sparse_80%_kd_onecycle_lr_100k",  # noqa: E501
 )
 
-#############
-# Deepspeed #
-#############
+# ---------
+# Deepspeed
+# ---------
 
 # This lr-range test is based on `bert_sparse_100k_kd_lr_range_test` and adapted
 # for deepspeed training.
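The comment in the first patch names two ways to keep skipped steps from distorting the LR range test: a static loss scale or a smaller initial scale power. The patch takes the second route (`initial_scale_power: 14`). As a minimal sketch of the first route, and not part of either patch, a static-scale variant could be derived from the deepspeed experiment as below; the config name and the loss-scale value are hypothetical.

from copy import deepcopy

# Hypothetical variant: with a fixed fp16 loss scale, deepspeed never skips
# optimizer/LR-scheduler steps, so every step of the range test reports a
# real learning rate.
bert_sparse_100k_kd_lr_range_test_deepspeed_static = deepcopy(
    bert_sparse_100k_kd_lr_range_test_deepspeed
)
bert_sparse_100k_kd_lr_range_test_deepspeed_static["deepspeed"]["fp16"] = {
    "enabled": True,
    # In deepspeed, a non-zero "loss_scale" selects a static loss scale;
    # the default of 0 enables dynamic loss scaling.
    "loss_scale": 4096,  # illustrative value only
}

The trade-off is that a static scale has to be chosen by hand: too small wastes fp16 precision, too large overflows on every step. The smaller `initial_scale_power` used in the patch keeps dynamic scaling and only shortens its warm-up.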