
RES-2190: Fix labels set to -100 in finetuning tasks #517

Merged: 11 commits, May 25, 2021
7 changes: 7 additions & 0 deletions projects/transformers/experiments/finetuning.py
@@ -144,6 +144,12 @@
)


initial_finetuning_tiny_bert50k_glue = deepcopy(finetuning_bert700k_glue)
initial_finetuning_tiny_bert50k_glue.update(
Contributor:

Why initial_finetuning_tiny_bert50k_glue as opposed to just finetuning_tiny_bert50k_glue? What does the initial signify?

model_name_or_path="/home/ec2-user/nta/results/experiments/transformers/tiny_bert_50k"
)
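
Note on the pattern above: the experiment entries in this file are plain dicts built by copying a base config and overriding a few fields. A minimal sketch of that pattern with illustrative values (only the model_name_or_path string comes from the actual diff; the base fields shown here are placeholders):

from copy import deepcopy

# Illustrative base config; the real finetuning_bert700k_glue defines many more fields.
finetuning_bert700k_glue = dict(
    model_name_or_path="/path/to/bert_700k",  # placeholder value
    task_name="glue",
    do_train=True,
)

# New experiment: same settings, different pretrained checkpoint.
initial_finetuning_tiny_bert50k_glue = deepcopy(finetuning_bert700k_glue)
initial_finetuning_tiny_bert50k_glue.update(
    model_name_or_path="/home/ec2-user/nta/results/experiments/transformers/tiny_bert_50k"
)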


finetuning_bert700k_single_task = deepcopy(finetuning_bert700k_glue)
finetuning_bert700k_single_task.update(
# logging
@@ -199,6 +205,7 @@
debug_finetuning_bert100k_ntasks=debug_finetuning_bert100k_ntasks,
finetuning_bert100k_glue=finetuning_bert100k_glue,
finetuning_bert100k_single_task=finetuning_bert100k_single_task,
initial_finetuning_tiny_bert50k_glue=initial_finetuning_tiny_bert50k_glue,
finetuning_bert700k_glue=finetuning_bert700k_glue,
finetuning_bert700k_single_task=finetuning_bert700k_single_task,
finetuning_bert1mi_glue=finetuning_bert1mi_glue,
8 changes: 6 additions & 2 deletions projects/transformers/run.py
@@ -52,7 +52,7 @@
from transformers.integrations import is_wandb_available
from transformers.trainer_utils import get_last_checkpoint, is_main_process

from experiments import CONFIGS
from experiments import CONFIGS
Contributor:

I think you added an extra space at the end of CONFIGS. Make sure you run flake8 in whatever repo you're submitting a PR for to catch these issues.

cd ~/nta/nupic.research
flake8

from integrations import CustomWandbCallback
from run_args import CustomTrainingArguments, DataTrainingArguments, ModelArguments
from run_utils import (
@@ -299,9 +299,10 @@ def run_finetuning_single_task(
eval_dataset = tokenized_datasets[
"validation_matched" if data_args.task_name == "mnli" else "validation"
]

test_dataset = None
if ((data_args.task_name is not None or data_args.test_file is not None)
and training_args.do_predict):
and training_args.do_predict):
Contributor:

You removed some whitespace here. Flake8 would have caught this as well.

test_dataset = tokenized_datasets[
"test_matched" if data_args.task_name == "mnli" else "test"
]
@@ -329,7 +330,9 @@ def run_finetuning_single_task(
trainer_callbacks=model_args.trainer_callbacks or None,
finetuning=True, task_name=data_args.task_name, is_regression=is_regression
)

if training_args.do_train:
logging.info(f"trainer.args.dataloader_drop_last before training: {trainer.args.dataloader_drop_last}")
train(trainer, training_args.output_dir, last_checkpoint)

# Evaluate
@@ -400,6 +403,7 @@ def run_finetuning_multiple_tasks(
training_args.output_dir = os.path.join(
base_training_args.output_dir, task_name
)

# Update any custom training hyperparameter
# TODO: allow hyperparameter search for each task
if task_name in model_args.task_hyperparams:
58 changes: 36 additions & 22 deletions projects/transformers/run_utils.py
@@ -125,7 +125,22 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
Returns evaluation dict with results.
"""
eval_results = {}

# trainer references args.dataloader_drop_last for both the train and eval dataloaders.
# We want to be able to drop the last batch for training, but not for eval, because
# dropping at eval leaves off examples, leading to inaccurate performance measures.
# This is a hack to toggle args.dataloader_drop_last off during evaluation.
drop_last = False
if trainer.args.dataloader_drop_last:
    drop_last = True
    trainer.args.dataloader_drop_last = False
    logging.info("Switched trainer.args.dataloader_drop_last to False for evaluation")

for eval_dataset, task in zip(eval_datasets, tasks):
vals, cnts = np.unique(np.array(eval_dataset["label"]), return_counts=True)
logging.info(f"Label distribution for {task} before calling trainer.evaluate")
for val, cnt in zip(vals, cnts):
    logging.info(f"Label={val}: {cnt} examples")
Contributor:

Did you intentionally mean to keep this logging? Do you think we need it?

eval_result = trainer.evaluate(eval_dataset=eval_dataset)
if task == "mnli-mm":
eval_result = {f"mm_{k}": v for k, v in eval_result.items()}
@@ -141,6 +156,11 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
logging.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")

# If drop_last was wanted for training, this toggles it back on
if drop_last:
    trainer.args.dataloader_drop_last = True
    logging.info("Switched trainer.args.dataloader_drop_last back on for further training")

return eval_results
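
A note on the toggle-and-restore pattern above: the same effect could be wrapped in a context manager so the flag is restored even if evaluation raises. A minimal sketch, assuming only a TrainingArguments-like object with a boolean dataloader_drop_last attribute; the temporarily_disable_drop_last helper is hypothetical and not part of this PR:

import logging
from contextlib import contextmanager


@contextmanager
def temporarily_disable_drop_last(args):
    """Turn off args.dataloader_drop_last for the duration of an evaluation run."""
    original = args.dataloader_drop_last
    args.dataloader_drop_last = False
    if original:
        logging.info("Disabled dataloader_drop_last for evaluation")
    try:
        yield args
    finally:
        # Restore the training-time setting even if evaluation raises.
        args.dataloader_drop_last = original


# Hypothetical usage inside evaluate_tasks:
# with temporarily_disable_drop_last(trainer.args):
#     eval_result = trainer.evaluate(eval_dataset=eval_dataset)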


@@ -708,7 +728,10 @@ def init_trainer(
# Add specific metrics for finetuning task
if finetuning:
compute_metrics = partial(
compute_metrics_task, task_name=task_name, is_regression=is_regression,
compute_metrics_task,
task_name=task_name,
is_regression=is_regression,
output_dir=training_args.output_dir
)
trainer_kwargs.update(compute_metrics=compute_metrics)

@@ -718,7 +741,8 @@


def compute_metrics_task(ep: EvalPrediction, metric=None,
task_name=None, is_regression=None):
task_name=None, is_regression=None,
output_dir=None):
"""
You can define your custom compute_metrics function. It takes an
`EvalPrediction` object (a namedtuple with a predictions and label_ids
@@ -727,41 +751,31 @@ def compute_metrics_task(ep: EvalPrediction, metric=None,
preds = (ep.predictions[0] if isinstance(ep.predictions, tuple) else ep.predictions)
preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)

if not is_regression:
logging.info(f"Label distribution for {task_name} before cleaning")
logging.info(f"Predictions: {Counter(preds).most_common()}")
logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")

# Ignore the -100 labels - not able to tokenize?
# TODO: investigate why a few labels are -100 in all tasks
label_ids = ep.label_ids[np.where(ep.label_ids != -100)]
preds = preds[np.where(ep.label_ids != -100)]
logging.info(f"Removing {1-(len(label_ids) / len(ep.label_ids)):.2%} samples "
"from evaluation set where label == -100")
assert -100 not in ep.label_ids, "unknown source of -100 labels"
Contributor:

Nit pick: Could you fix the punctuation? "Unknown source of -100 labels."


if not is_regression:
logging.info(f"Label distribution for {task_name} after cleaning")
logging.info(f"Label distribution for {task_name}")
logging.info(f"Predictions: {Counter(preds).most_common()}")
logging.info(f"Labels: {Counter(label_ids).most_common()}")
logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")

if task_name is not None:
if task_name == "cola":
result = {"matthews_correlation": matthews_corrcoef(label_ids, preds)}
result = {"matthews_correlation": matthews_corrcoef(ep.label_ids, preds)}
elif task_name == "stsb":
result = pearson_and_spearman(preds, label_ids)
result = pearson_and_spearman(preds, ep.label_ids)
elif task_name in ["mrpc", "qqp"]:
result = acc_and_f1(preds, label_ids)
result = acc_and_f1(preds, ep.label_ids)
elif task_name in ["sst2", "mnli", "mnli-mm", "mnli_mismatched", "mnli_matched",
"qnli", "rte", "wnli", "hans"]:
result = {"accuracy": simple_accuracy(preds, label_ids)}
result = {"accuracy": simple_accuracy(preds, ep.label_ids)}
# Consolidate if more than one metric
if len(result) > 1:
result["combined_score"] = np.mean(list(result.values())).item()
return result
elif is_regression:
return {"mse": ((preds - label_ids) ** 2).mean().item()}
return {"mse": ((preds - ep.label_ids) ** 2).mean().item()}
else:
return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}
return {"accuracy": (preds == ep.label_ids).astype(np.float32).mean().item()}


def simple_accuracy(preds, labels):
@@ -916,7 +930,7 @@ def model_init():
"HP search with saved models not supported."
logging.info("Pretraining new model from scratch")

# Instantiate model; possibly one of our custom sparse models.
# Instantiate model; possibly one of our custom sparse models.
Contributor:

The indentation looks like it was changed here as well.

config_cls = CUSTOM_CONFIG_MAPPING[config.model_type]
model_for_lm_cls = CUSTOM_MASKED_LM_MAPPING[config_cls]
model = model_for_lm_cls(config)