RES-2190: Fix labels set to -100 in finetuning tasks #517

Merged · 11 commits · May 25, 2021
8 changes: 8 additions & 0 deletions projects/transformers/experiments/finetuning.py
@@ -144,6 +144,13 @@
)


finetuning_tiny_bert50k_glue = deepcopy(finetuning_bert700k_glue)
finetuning_tiny_bert50k_glue.update(
model_name_or_path="/home/ec2-user"
"/nta/results/experiments/transformers/tiny_bert_50k"
)


finetuning_bert700k_single_task = deepcopy(finetuning_bert700k_glue)
finetuning_bert700k_single_task.update(
# logging
@@ -199,6 +206,7 @@
debug_finetuning_bert100k_ntasks=debug_finetuning_bert100k_ntasks,
finetuning_bert100k_glue=finetuning_bert100k_glue,
finetuning_bert100k_single_task=finetuning_bert100k_single_task,
finetuning_tiny_bert50k_glue=finetuning_tiny_bert50k_glue,
finetuning_bert700k_glue=finetuning_bert700k_glue,
finetuning_bert700k_single_task=finetuning_bert700k_single_task,
finetuning_bert1mi_glue=finetuning_bert1mi_glue,
13 changes: 8 additions & 5 deletions projects/transformers/run.py
@@ -299,12 +299,13 @@ def run_finetuning_single_task(
eval_dataset = tokenized_datasets[
"validation_matched" if data_args.task_name == "mnli" else "validation"
]

test_dataset = None
if ((data_args.task_name is not None or data_args.test_file is not None)
and training_args.do_predict):
test_dataset = tokenized_datasets[
"test_matched" if data_args.task_name == "mnli" else "test"
]
if (data_args.task_name is not None or data_args.test_file is not None):

Contributor: I didn't understand this change. What is the difference between the old version and the new?

Contributor (author): No semantic difference. I just couldn't figure out how to make it PEP 8 compliant; I was getting linter complaints about the indentation or the line length, so I just broke it up.
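
As an aside (not part of the PR), here is a minimal runnable sketch of the pattern under discussion, using hypothetical stand-in variables: a compound condition that is awkward to wrap under flake8's line-length and continuation-indent rules (e.g. E501/E129), versus the equivalent nested form adopted below.

# Hypothetical stand-ins for the data_args / training_args fields used above.
task_name, test_file, do_predict = "mnli", None, True

# Original form: one compound condition. Correct, but hard to wrap within the
# line-length limit without triggering continuation-indent warnings.
if (task_name is not None or test_file is not None) and do_predict:
    print("would select the test split")

# Equivalent nested form: same behavior, simpler to indent.
if task_name is not None or test_file is not None:
    if do_predict:
        print("would select the test split")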

if training_args.do_predict:
test_dataset = tokenized_datasets[
"test_matched" if data_args.task_name == "mnli" else "test"
]

# Log fingerprint used in HF smart caching
logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")
@@ -329,6 +330,7 @@ def run_finetuning_single_task(
trainer_callbacks=model_args.trainer_callbacks or None,
finetuning=True, task_name=data_args.task_name, is_regression=is_regression
)

if training_args.do_train:
train(trainer, training_args.output_dir, last_checkpoint)

@@ -400,6 +402,7 @@ def run_finetuning_multiple_tasks(
training_args.output_dir = os.path.join(
base_training_args.output_dir, task_name
)

# Update any custom training hyperparameter
# TODO: allow hyperparameter search for each task
if task_name in model_args.task_hyperparams:
72 changes: 51 additions & 21 deletions projects/transformers/run_utils.py
@@ -119,12 +119,34 @@ def train(trainer, output_dir, last_checkpoint=None):
))


def toggle_drop_last(trainer, should_drop_last):
    """
    Turn trainer.args.dataloader_drop_last on or off depending on the use case.

    If drop_last is left on, you can get skewed results any time
    trainer.evaluate or trainer.predict is called, since drop_last causes the
    samples in the last, incomplete batch to come back labeled -100.

    You'll want to use this if you want drop_last on for training, but off
    for testing.

    Example usage at evaluation time:
        drop_last = toggle_drop_last(trainer, False)
        trainer.evaluate(...)
        _ = toggle_drop_last(trainer, drop_last)
    """
    if should_drop_last:
        trainer.args.dataloader_drop_last = True
        return False
    else:
        trainer.args.dataloader_drop_last = False
        return True
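
For concreteness, a minimal sketch of the save/toggle/restore pattern (not from the PR): it uses a SimpleNamespace stand-in for a transformers Trainer and assumes the toggle_drop_last defined above is in scope.

from types import SimpleNamespace

# Stand-in for a Trainer: only the attribute toggle_drop_last touches.
trainer = SimpleNamespace(args=SimpleNamespace(dataloader_drop_last=True))

# Turn drop_last off for evaluation; the return value records that it should
# be switched back on afterwards (the helper assumes drop_last is wanted on
# during training).
drop_last = toggle_drop_last(trainer, False)
assert trainer.args.dataloader_drop_last is False

# ... trainer.evaluate(...) or trainer.predict(...) would run here ...

# Restore the training-time setting.
_ = toggle_drop_last(trainer, drop_last)
assert trainer.args.dataloader_drop_last is True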


def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
"""
Evaluate tasks after finetuning.
Returns evaluation dict with results.
"""
drop_last = toggle_drop_last(trainer, False) # should_drop_last=False
eval_results = {}

for eval_dataset, task in zip(eval_datasets, tasks):
eval_result = trainer.evaluate(eval_dataset=eval_dataset)
if task == "mnli-mm":
@@ -141,13 +163,19 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
logging.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")

# if you want drop_last for training, this toggles it back on
_ = toggle_drop_last(trainer, drop_last)

return eval_results


def test_tasks(trainer, output_dir, tasks, test_datasets, is_regression, label_list):
"""
Test tasks after finetuning.
"""

drop_last = toggle_drop_last(trainer, False)

for test_dataset, task in zip(test_datasets, tasks):
# Removing the `label` columns because it contains -1
# and Trainer won't like that.
@@ -170,9 +198,13 @@ def test_tasks(trainer, output_dir, tasks, test_datasets, is_regression, label_list):
item = label_list[item]
writer.write(f"{index}\t{item}\n")

_ = toggle_drop_last(trainer, drop_last)


def evaluate_language_model(trainer, eval_dataset, output_dir):
"""Evaluate language model. Returns dict with results on perplexity metric. """
drop_last = toggle_drop_last(trainer, False)

results = {}
eval_output = trainer.evaluate(eval_dataset)

@@ -187,6 +219,8 @@ def evaluate_language_model(trainer, eval_dataset, output_dir):
logging.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")

_ = toggle_drop_last(trainer, drop_last)

return results


@@ -708,7 +742,10 @@ def init_trainer(
# Add specific metrics for finetuning task
if finetuning:
compute_metrics = partial(
compute_metrics_task, task_name=task_name, is_regression=is_regression,
compute_metrics_task,
task_name=task_name,
is_regression=is_regression,
output_dir=training_args.output_dir
)
trainer_kwargs.update(compute_metrics=compute_metrics)

@@ -718,7 +755,8 @@


def compute_metrics_task(ep: EvalPrediction, metric=None,
task_name=None, is_regression=None):
task_name=None, is_regression=None,
output_dir=None):
"""
You can define your custom compute_metrics function. It takes an
`EvalPrediction` object (a namedtuple with a predictions and label_ids
@@ -727,41 +765,33 @@ def compute_metrics_task(ep: EvalPrediction, metric=None,
preds = (ep.predictions[0] if isinstance(ep.predictions, tuple) else ep.predictions)
preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)

if not is_regression:
logging.info(f"Label distribution for {task_name} before cleaning")
logging.info(f"Predictions: {Counter(preds).most_common()}")
logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")

# Ignore the -100 labels - not able to tokenize?
# TODO: investigate why a few labels are -100 in all tasks
label_ids = ep.label_ids[np.where(ep.label_ids != -100)]
preds = preds[np.where(ep.label_ids != -100)]
logging.info(f"Removing {1-(len(label_ids) / len(ep.label_ids)):.2%} samples "
"from evaluation set where label == -100")
# -100 labels can come up when drop_last batch setting gets set to true during
# evaluation. That is fixed, so any -100 labels should not pass silently.
assert -100 not in ep.label_ids, "Unknown source of -100 labels"

if not is_regression:
logging.info(f"Label distribution for {task_name} after cleaning")
logging.info(f"Label distribution for {task_name}")
logging.info(f"Predictions: {Counter(preds).most_common()}")
logging.info(f"Labels: {Counter(label_ids).most_common()}")
logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")

if task_name is not None:
if task_name == "cola":
result = {"matthews_correlation": matthews_corrcoef(label_ids, preds)}
result = {"matthews_correlation": matthews_corrcoef(ep.label_ids, preds)}
elif task_name == "stsb":
result = pearson_and_spearman(preds, label_ids)
result = pearson_and_spearman(preds, ep.label_ids)
elif task_name in ["mrpc", "qqp"]:
result = acc_and_f1(preds, label_ids)
result = acc_and_f1(preds, ep.label_ids)
elif task_name in ["sst2", "mnli", "mnli-mm", "mnli_mismatched", "mnli_matched",
"qnli", "rte", "wnli", "hans"]:
result = {"accuracy": simple_accuracy(preds, label_ids)}
result = {"accuracy": simple_accuracy(preds, ep.label_ids)}
# Consolidate if more than one metric
if len(result) > 1:
result["combined_score"] = np.mean(list(result.values())).item()
return result
elif is_regression:
return {"mse": ((preds - label_ids) ** 2).mean().item()}
return {"mse": ((preds - ep.label_ids) ** 2).mean().item()}
else:
return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}
return {"accuracy": (preds == ep.label_ids).astype(np.float32).mean().item()}


def simple_accuracy(preds, labels):