RES-2190: Fix labels set to -100 in finetuning tasks #517
Changes from 2 commits
```diff
@@ -52,7 +52,7 @@
 from transformers.integrations import is_wandb_available
 from transformers.trainer_utils import get_last_checkpoint, is_main_process

-from experiments import CONFIGS
+from experiments import CONFIGS
 from integrations import CustomWandbCallback
 from run_args import CustomTrainingArguments, DataTrainingArguments, ModelArguments
 from run_utils import (
```

Review comment: I think you added an extra space at the end of `from experiments import CONFIGS`.
```diff
@@ -299,9 +299,10 @@ def run_finetuning_single_task(
     eval_dataset = tokenized_datasets[
         "validation_matched" if data_args.task_name == "mnli" else "validation"
     ]

     test_dataset = None
     if ((data_args.task_name is not None or data_args.test_file is not None)
-            and training_args.do_predict):
+            and training_args.do_predict):
         test_dataset = tokenized_datasets[
             "test_matched" if data_args.task_name == "mnli" else "test"
         ]
```

Review comment: You removed some whitespace here. Flake8 would have caught this as well.
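For background on the `"validation_matched"` / `"test_matched"` selection above: GLUE's MNLI task ships separate matched and mismatched splits rather than a plain `"validation"` or `"test"` split, which is why the task name is special-cased. A quick, hypothetical way to confirm the split names with the `datasets` library:

```python
from datasets import get_dataset_split_names

# MNLI exposes matched/mismatched variants rather than plain validation/test splits.
print(get_dataset_split_names("glue", "mnli"))
# Expected (roughly): ['train', 'validation_matched', 'validation_mismatched',
#                      'test_matched', 'test_mismatched']
```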
```diff
@@ -329,7 +330,9 @@ def run_finetuning_single_task(
         trainer_callbacks=model_args.trainer_callbacks or None,
         finetuning=True, task_name=data_args.task_name, is_regression=is_regression
     )

     if training_args.do_train:
+        logging.info(f"trainer.args.dataloader_drop_last before training: {trainer.args.dataloader_drop_last}")
         train(trainer, training_args.output_dir, last_checkpoint)

     # Evaluate
```
```diff
@@ -400,6 +403,7 @@ def run_finetuning_multiple_tasks(
     training_args.output_dir = os.path.join(
         base_training_args.output_dir, task_name
     )

     # Update any custom training hyperparameter
     # TODO: allow hyperparameter search for each task
     if task_name in model_args.task_hyperparams:
```
```diff
@@ -125,7 +125,22 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
     Returns evaluation dict with results.
     """
     eval_results = {}

+    # trainer references args.dataloader_drop_last for both train and eval dataloaders.
+    # We want to be able to drop last for training, but not for eval, because
+    # dropping at eval leaves off examples, leading to innacurate performance measures
+    # This is a hack to toggle args.dataloader_drop_last
+    drop_last = False
+    if trainer.args.dataloader_drop_last:
+        drop_last = True
+        trainer.args.dataloader_drop_last = False
+        logging.info("Switched trainer.args.dataloader_drop_last to False for evaluation")
+
     for eval_dataset, task in zip(eval_datasets, tasks):
+        vals, cnts = np.unique(np.array(eval_dataset['label']), return_counts=True)
+        logging.info(f"Label distribution for {task} before calling trainer.evaluate")
+        for val in range(len(vals)):
+            logging.info(f"Label={vals[val]}: {cnts[val]} examples")
         eval_result = trainer.evaluate(eval_dataset=eval_dataset)
         if task == "mnli-mm":
             eval_result = {f"mm_{k}": v for k, v in eval_result.items()}
```

Review comment (on the per-task label logging): Did you intentionally mean to keep this logging? Do you think we need it?
```diff
@@ -141,6 +156,11 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
             logging.info(f" {key} = {value}")
             writer.write(f"{key} = {value}\n")

+    # if you want drop_last for training, this toggles it back on
+    if drop_last:
+        trainer.args.dataloader_drop_last = True
+        logging.info("Switched trainer.args.dataloader_drop_last to back on for further training")
+
     return eval_results
```
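Since the toggle mutates shared trainer state and then has to be undone later in the same function, an alternative worth considering is a context manager that restores the original setting automatically. A minimal sketch, assuming `trainer` is a Hugging Face `Trainer`; the helper name `eval_drop_last_disabled` is hypothetical, not part of this repo:

```python
import logging
from contextlib import contextmanager


@contextmanager
def eval_drop_last_disabled(trainer):
    """Temporarily set dataloader_drop_last=False so evaluation sees every example."""
    original = trainer.args.dataloader_drop_last
    trainer.args.dataloader_drop_last = False
    if original:
        logging.info("Disabled dataloader_drop_last for evaluation")
    try:
        yield trainer
    finally:
        # Restore whatever was configured for training.
        trainer.args.dataloader_drop_last = original


# Hypothetical usage inside evaluate_tasks:
#     with eval_drop_last_disabled(trainer):
#         eval_result = trainer.evaluate(eval_dataset=eval_dataset)
```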
```diff
@@ -708,7 +728,10 @@ def init_trainer(
     # Add specific metrics for finetuning task
     if finetuning:
         compute_metrics = partial(
-            compute_metrics_task, task_name=task_name, is_regression=is_regression,
+            compute_metrics_task,
+            task_name=task_name,
+            is_regression=is_regression,
+            output_dir=training_args.output_dir
         )
         trainer_kwargs.update(compute_metrics=compute_metrics)
```
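As background on this pattern: `Trainer` invokes `compute_metrics` with a single `EvalPrediction`, so any task-specific arguments (including the new `output_dir`) have to be pre-bound with `functools.partial`. A minimal sketch of the idea; `metrics_for_task` below is a hypothetical stand-in, not the repo's `compute_metrics_task`:

```python
from functools import partial

import numpy as np
from transformers import EvalPrediction


def metrics_for_task(ep: EvalPrediction, task_name=None, is_regression=None, output_dir=None):
    # Stand-in metric: plain accuracy over argmax predictions.
    preds = np.argmax(ep.predictions, axis=1)
    return {"accuracy": float((preds == ep.label_ids).mean())}


# Bind everything except the EvalPrediction so the signature matches what Trainer expects.
compute_metrics = partial(
    metrics_for_task,
    task_name="rte",
    is_regression=False,
    output_dir="/tmp/finetuning_output",  # hypothetical path
)

# Trainer(..., compute_metrics=compute_metrics) then calls compute_metrics(ep) at each evaluation.
```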
```diff
@@ -718,7 +741,8 @@
 def compute_metrics_task(ep: EvalPrediction, metric=None,
-                         task_name=None, is_regression=None):
+                         task_name=None, is_regression=None,
+                         output_dir=None):
     """
     You can define your custom compute_metrics function. It takes an
     `EvalPrediction` object (a namedtuple with a predictions and label_ids
```
```diff
@@ -727,41 +751,31 @@ def compute_metrics_task(ep: EvalPrediction, metric=None,
     preds = (ep.predictions[0] if isinstance(ep.predictions, tuple) else ep.predictions)
     preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)

-    if not is_regression:
-        logging.info(f"Label distribution for {task_name} before cleaning")
-        logging.info(f"Predictions: {Counter(preds).most_common()}")
-        logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")
-
-    # Ignore the -100 labels - not able to tokenize?
-    # TODO: investigate why a few labels are -100 in all tasks
-    label_ids = ep.label_ids[np.where(ep.label_ids != -100)]
-    preds = preds[np.where(ep.label_ids != -100)]
-    logging.info(f"Removing {1-(len(label_ids) / len(ep.label_ids)):.2%} samples "
-                 "from evaluation set where label == -100")
+    assert -100 not in ep.label_ids, "unknown source of -100 labels"

     if not is_regression:
-        logging.info(f"Label distribution for {task_name} after cleaning")
+        logging.info(f"Label distribution for {task_name}")
         logging.info(f"Predictions: {Counter(preds).most_common()}")
-        logging.info(f"Labels: {Counter(label_ids).most_common()}")
+        logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")

     if task_name is not None:
         if task_name == "cola":
-            result = {"matthews_correlation": matthews_corrcoef(label_ids, preds)}
+            result = {"matthews_correlation": matthews_corrcoef(ep.label_ids, preds)}
         elif task_name == "stsb":
-            result = pearson_and_spearman(preds, label_ids)
+            result = pearson_and_spearman(preds, ep.label_ids)
         elif task_name in ["mrpc", "qqp"]:
-            result = acc_and_f1(preds, label_ids)
+            result = acc_and_f1(preds, ep.label_ids)
         elif task_name in ["sst2", "mnli", "mnli-mm", "mnli_mismatched", "mnli_matched",
                            "qnli", "rte", "wnli", "hans"]:
-            result = {"accuracy": simple_accuracy(preds, label_ids)}
+            result = {"accuracy": simple_accuracy(preds, ep.label_ids)}
         # Consolidate if more than one metric
         if len(result) > 1:
             result["combined_score"] = np.mean(list(result.values())).item()
         return result
     elif is_regression:
-        return {"mse": ((preds - label_ids) ** 2).mean().item()}
+        return {"mse": ((preds - ep.label_ids) ** 2).mean().item()}
     else:
-        return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}
+        return {"accuracy": (preds == ep.label_ids).astype(np.float32).mean().item()}


 def simple_accuracy(preds, labels):
```

Review comment (on the new assert): Nit pick: Could you fix the punctuation? "Unknown source of -100 labels."
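For background on the assert: -100 is the conventional ignore index in PyTorch's `CrossEntropyLoss` and is what Hugging Face collators use for label positions that should not contribute to the loss, so a -100 appearing in a sequence-classification eval set usually points to a data or padding problem rather than a real class. A small, hypothetical sanity-check helper in the spirit of the assert above:

```python
import logging
from collections import Counter

import numpy as np


def check_label_ids(label_ids: np.ndarray, task_name: str) -> None:
    """Fail loudly if the ignore index (-100) leaks into classification labels."""
    counts = Counter(label_ids.tolist())
    logging.info(f"Label distribution for {task_name}: {counts.most_common()}")
    num_ignored = counts.get(-100, 0)
    if num_ignored:
        raise ValueError(
            f"{num_ignored} labels are -100 for {task_name}; "
            "unknown source of -100 labels."
        )
```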
```diff
@@ -916,7 +930,7 @@ def model_init():
         "HP search with saved models not supported."
     logging.info("Pretraining new model from scratch")

-    # Instantiate model; possibly one of our custom sparse models.
+    # Instantiate model; possibly one of our custom sparse models.
     config_cls = CUSTOM_CONFIG_MAPPING[config.model_type]
     model_for_lm_cls = CUSTOM_MASKED_LM_MAPPING[config_cls]
     model = model_for_lm_cls(config)
```

Review comment: The indentation looks like it was changed here as well.
Review comment: Why `initial_finetuning_tiny_bert50k_glue` as opposed to just `finetuning_tiny_bert50k_glue`? What does the `initial` signify?