Commit
Merge pull request #521 from benja-matic/master
Better drop_last fix, updated results table (RES-2222)
benja-matic committed Jun 2, 2021
2 parents e35bea8 + 765d02f commit e8c0c3b
Showing 5 changed files with 101 additions and 38 deletions.
6 changes: 3 additions & 3 deletions projects/transformers/README.md
@@ -9,9 +9,9 @@ In progress, current results:
| | Average w/o wnli | Average all tasks | Matthews corr. | Matched acc./Mismatched acc. | F1/Accuracy | Accuracy | Accuracy/F1 | Accuracy | Accuracy | Pearson/Spearman corr. | Accuracy | | log(perplexity) |
| bert_HF | 81.67 | 78.85 | 56.53 | 83.91/84.10 | 88.85/84.07 | 90.66 | 90.71/87.49 | 65.70 | 92.32 | 88.64/88.48 | 56.34 | | |
| bert_paper| 79.60 | - | 52.10 | 84.60/83.40 | 88.90/- | 90.50 | 71.20/- | 66.40 | 93.50 | 85.80 | - | 3.99 (RoBERTa) | 1.384 |
| bert_1mi | 80.76 | 76.82 | 48.87 | 84.08/84.57 | 89.76/85.68 | 91.19 | 90.58/87.17 | 66.02 | 91.44 | 87.67/87.54 | 45.31 | 5.013 | 1.612 |
| bert_100k | 75.71 | 72.51 | 40.98 | 78.26/78.65 | 84.05/77.86 | 87.74 | 89.12/85.25 | 58.2 | 88.31 | 83.90/83.80 | 46.88 | 8.619 | 2.154 |
| sparse_80%_kd_onecycle_lr_rigl | 75.3 | 72.83 | 38.29 | 79.72/80.88 | 87.31/82.81 | 87.65 | 89.19/85.25 | 54.3 | 88.89 | 80.74/80.59 | 53.12 | 8.482 | 2.138 |
| bert_1mi | 80.13 | 77.17 | 45.81 | 84.27/84.63 | 88.26/83.82 | 91.21 | 90.54/87.20 | 65.34 | 91.86 | 87.41/87.43 | 53.52 | 5.013 | 1.612 |
| bert_100k | 75.36 | 71.68 | 39.56 | 78.88/79.08 | 82.71/76.23 | 87.77 | 89.31/85.57 | 58.12 | 87.61 | 83.95/83.84 | 42.25 | 8.619 | 2.154 |
| sparse_80%_kd_onecycle_lr_rigl | 75.3 | 72.57 | 36.49 | 79.23/79.66 | 86.55/81.86 | 88.23 | 89.39/85.64 | 54.51 | 90.6 | 81.31/81.24 | 50.7 | 8.482 | 2.138 |
| sparse_80%_kd_onecycle_lr | 74.17 | 72.18 | 27.2 | 78.34/79.97 | 87.29/82.55 | 88.57 | 88.91/84.97 | 58.59 | 88.77 | 79.11/79.33 | 56.25 | 9.78 | 2.28 |

<br/><br/>
4 changes: 2 additions & 2 deletions projects/transformers/callbacks/sparsity.py
@@ -144,10 +144,10 @@ def on_step_end(self, args, state, control, model=None, **kwargs):
return

# Plot densities for each layer.
df_dendity_by_layer = get_density_by_layer(self.sparse_modules)
df_density_by_layer = get_density_by_layer(self.sparse_modules)
fig, ax = plt.subplots(figsize=(8, 5), constrained_layout=True)
sns.stripplot(
data=df_dendity_by_layer,
data=df_density_by_layer,
y="density",
x="layer",
hue=None,
57 changes: 57 additions & 0 deletions projects/transformers/experiments/finetuning.py
@@ -129,6 +129,50 @@
model_name_or_path="/mnt/efs/results/pretrained-models/transformers-local/bert_100k", # noqa: E501
)

# The name 'simple' refers to the paper "On the Stability of Fine-tuning BERT",
# where the authors propose a "simple but hard to beat" approach:
# https://openreview.net/pdf?id=nzpLWnVAyah
#
# How num_train_epochs was decided for each task:
# The paper recommends 20 epochs for rte, which is about 50k iterations, and
# claims that the number of iterations matters more than dataset size. Here I
# aim for ~50k iterations, unless the training set already has > 50k examples.
#
# if len(train_dataset) < 50k:
#     train for ~50k iterations: num_train_epochs = round(50k / len(train_dataset))
#     (cola, mrpc, stsb, rte, wnli)
#
# elif 50k <= len(train_dataset) < 300k:
#     use the default of 3 epochs
#     (sst2, qnli)
#
# elif len(train_dataset) >= 300k:
#     train for 1 epoch
#     (qqp, mnli)
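
# A minimal sketch of the rule above (this helper is illustrative only and is
# not used by the configs below; sizes are approximate GLUE train-set counts):
def _epochs_for_task(train_size, target_iterations=50_000):
    if train_size < 50_000:
        # aim for ~50k iterations total
        return max(1, round(target_iterations / train_size))
    elif train_size < 300_000:
        return 3  # HF Trainer default
    return 1

assert _epochs_for_task(8_551) == 6    # cola
assert _epochs_for_task(2_490) == 20   # rte
assert _epochs_for_task(635) == 79     # wnli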

finetuning_bert100k_glue_simple = deepcopy(finetuning_bert100k_glue)
finetuning_bert100k_glue_simple.update(
warmup_ratio=0.1,
task_hyperparams=dict(
cola=dict(num_train_epochs=6, num_runs=5), # 6 * 8500 ~ 50k
sst2=dict(num_runs=3), # 67k training size > 50k, default 3 epochs
mrpc=dict(num_train_epochs=14, num_runs=3), # 3700 * 14 ~ 51k
stsb=dict(num_train_epochs=8, num_runs=3), # 7000 * 8 > 50k
# hypothesis for qqp, mnli: training stable < 300k iterations
# more runs is better than 1 run with more epochs
qqp=dict(num_train_epochs=1, num_runs=3), # 300k >> 50k
mnli=dict(num_train_epochs=1, num_runs=3), # 300k >> 50k
qnli=dict(num_runs=3), # 100k > 50k, default to 3 epochs
rte=dict(num_train_epochs=20, num_runs=3), # exactly as in the paper
wnli=dict(num_train_epochs=79, num_runs=3) # large n_epochs to hit > 50k
)
)

finetuning_bert1mi_glue_simple = deepcopy(finetuning_bert100k_glue_simple)
finetuning_bert1mi_glue_simple.update(
model_name_or_path="/mnt/efs/results/pretrained-models/transformers-local/bert_1mi"
)

finetuning_bert1mi_glue = deepcopy(finetuning_bert700k_glue)
finetuning_bert1mi_glue.update(
# logging
@@ -144,6 +188,16 @@
)


finetuning_bert1mi_wnli = deepcopy(finetuning_bert100k_single_task)
finetuning_bert1mi_wnli.update(
task_name=None,
task_names=["wnli"],
evaluation_strategy="steps",
eval_steps=15,
num_train_epochs=5,
)


finetuning_tiny_bert50k_glue = deepcopy(finetuning_bert700k_glue)
finetuning_tiny_bert50k_glue.update(
model_name_or_path="/home/ec2-user"
@@ -209,7 +263,10 @@
finetuning_tiny_bert50k_glue=finetuning_tiny_bert50k_glue,
finetuning_bert700k_glue=finetuning_bert700k_glue,
finetuning_bert700k_single_task=finetuning_bert700k_single_task,
finetuning_bert100k_glue_simple=finetuning_bert100k_glue_simple,
finetuning_bert1mi_glue=finetuning_bert1mi_glue,
finetuning_bert1mi_glue_simple=finetuning_bert1mi_glue_simple,
finetuning_bert1mi_wnli=finetuning_bert1mi_wnli,
finetuning_bert1mi_single_task=finetuning_bert1mi_single_task,
finetuning_sparse_bert_100k_glue=finetuning_sparse_bert_100k_glue,
finetuning_sparse_encoder_bert_100k_glue=finetuning_sparse_encoder_bert_100k_glue,
16 changes: 15 additions & 1 deletion projects/transformers/experiments/trifecta.py
@@ -30,7 +30,7 @@
RigLMixin,
)

from .finetuning import finetuning_bert700k_glue
from .finetuning import finetuning_bert100k_glue_simple, finetuning_bert700k_glue
from .sparse_bert import fully_static_sparse_bert_100k_fp16
from .sparse_bertitos import small_bert_sparse_100k, tiny_bert_sparse_100k

@@ -258,6 +258,18 @@ class KDLRRangeTestTrainer(LRRangeTestMixin,
model_name_or_path="/mnt/efs/results/pretrained-models/transformers-local/bert_sparse_80%_trifecta_100k", # noqa: E501
)

finetuning_bert_sparse_trifecta_100k_glue_simple = deepcopy(
finetuning_bert100k_glue_simple)
finetuning_bert_sparse_trifecta_100k_glue_simple.update(
# Model arguments
model_type="fully_static_sparse_bert",
model_name_or_path="/mnt/efs/results/pretrained-models/transformers-local/"
"bert_sparse_80%_trifecta_100k",
)

# Alias with a shorter variable name for PEP 8 compliance below.
ft_bert_sp_tri_100k_g_s = finetuning_bert_sparse_trifecta_100k_glue_simple


CONFIGS = dict(
# Tiny BERT
@@ -274,4 +286,6 @@ class KDLRRangeTestTrainer(LRRangeTestMixin,
# BERT Base
bert_sparse_trifecta_100k=bert_sparse_trifecta_100k,
finetuning_bert_sparse_trifecta_100k_glue=finetuning_bert_sparse_trifecta_100k_glue,
finetuning_bert_sparse_trifecta_100k_glue_simple=ft_bert_sp_tri_100k_g_s,

)
56 changes: 24 additions & 32 deletions projects/transformers/run_utils.py
@@ -119,32 +119,11 @@ def train(trainer, output_dir, last_checkpoint=None):
))


def toggle_drop_last(trainer, should_drop_last):
"""
Turn trainer.args.dataloader_drop_last on or off depending on use case
If drop_last is left on, then you can get skewed results anytime
trainer.evaluate or trainer.predict is called, since drop_last will set
the last batch with incomplete number of samples to be labeled -100
You'll want to use this if you want drop_last on for training, but off
for testing
Example usage at evaluation time
drop_last = toggle_drop_last(trainer, False)
trainer.evaluate(...)
_ = toggle_drop_last(trainer, drop_last)
"""
if should_drop_last:
return False
else:
trainer.args.dataloader_drop_last = False
return True


def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
"""
Evaluate tasks after finetuning.
Returns evaluation dict with results.
"""
drop_last = toggle_drop_last(trainer, False) # should_drop_last=False
eval_results = {}

for eval_dataset, task in zip(eval_datasets, tasks):
@@ -163,9 +142,6 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
logging.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")

# if you want drop_last for training, this toggles it back on
_ = toggle_drop_last(trainer, drop_last)

return eval_results


@@ -174,8 +150,6 @@ def test_tasks(trainer, output_dir, tasks, test_datasets, is_regression, label_l
Test tasks after finetuning.
"""

drop_last = toggle_drop_last(trainer, False)

for test_dataset, task in zip(test_datasets, tasks):
# Removing the `label` columns because it contains -1
# and Trainer won't like that.
@@ -198,13 +172,9 @@
item = label_list[item]
writer.write(f"{index}\t{item}\n")

_ = toggle_drop_last(trainer, drop_last)


def evaluate_language_model(trainer, eval_dataset, output_dir):
"""Evaluate language model. Returns dict with results on perplexity metric. """
drop_last = toggle_drop_last(trainer, False)

results = {}
eval_output = trainer.evaluate(eval_dataset)

@@ -219,8 +189,6 @@
logging.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")

_ = toggle_drop_last(trainer, drop_last)

return results


@@ -708,6 +676,23 @@ def init_model(model_args, config, tokenizer, finetuning=False):
return model


def toggle_drop_last_wrapper(method):
"""
Return a function that turns drop_last off before `method` is called. Used
to ensure trainer.args.dataloader_drop_last is False during evaluation
steps. After the method is called, dataloader_drop_last is restored to
its initial value.
"""
def toggle_method(*args, **kwargs):
was_drop_last = method.__self__.args.dataloader_drop_last # initial drop_last
method.__self__.args.dataloader_drop_last = False # turn drop_last off
result = method(*args, **kwargs) # call method with drop_last off
method.__self__.args.dataloader_drop_last = was_drop_last # restore drop_last
return result

return toggle_method
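
# Note on the pattern above: for a bound method, `method.__self__` is the
# instance the method is bound to (here, the HF Trainer), which is how the
# wrapper reaches `args.dataloader_drop_last` without taking the trainer as
# an explicit argument. A minimal sketch of the same idea, with illustrative
# names only:
#
#     def with_flag_off(method):
#         def wrapped(*args, **kwargs):
#             owner = method.__self__            # the bound instance
#             saved, owner.flag = owner.flag, False
#             try:
#                 return method(*args, **kwargs)
#             finally:
#                 owner.flag = saved             # restore even on error
#         return wrapped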


def init_trainer(
tokenizer,
data_collator,
@@ -751,6 +736,13 @@ def init_trainer(

trainer = trainer_class(**trainer_kwargs)

# Issue: labels in the last, incomplete batch get set to -100 when
# drop_last is on.
# Fix: override the evaluate and predict methods.
# The previous fix only covered the cases where we call
# trainer.{evaluate, predict} ourselves; this one covers all cases,
# including any time HF calls these methods internally.
trainer.evaluate = toggle_drop_last_wrapper(trainer.evaluate)
trainer.predict = toggle_drop_last_wrapper(trainer.predict)

return trainer

