From b55af8656f3e50a37bff6f46d58e76c529bff64a Mon Sep 17 00:00:00 2001
From: Ben Cohen
Date: Tue, 18 May 2021 15:47:09 -0400
Subject: [PATCH 1/7] Change HF trainer.args as you go to avoid drop_last
 leaving off evaluation samples

---
 .../transformers/experiments/finetuning.py |  7 ++
 projects/transformers/run.py               |  8 +-
 projects/transformers/run_utils.py         | 74 ++++++++++++++-----
 3 files changed, 68 insertions(+), 21 deletions(-)

diff --git a/projects/transformers/experiments/finetuning.py b/projects/transformers/experiments/finetuning.py
index b589a885a..dd7d99791 100644
--- a/projects/transformers/experiments/finetuning.py
+++ b/projects/transformers/experiments/finetuning.py
@@ -144,6 +144,12 @@
 )
 
 
+initial_finetuning_tiny_bert50k_glue = deepcopy(finetuning_bert700k_glue)
+initial_finetuning_tiny_bert50k_glue.update(
+    model_name_or_path="/home/ec2-user/nta/results/experiments/transformers/tiny_bert_50k"
+)
+
+
 finetuning_bert700k_single_task = deepcopy(finetuning_bert700k_glue)
 finetuning_bert700k_single_task.update(
     # logging
@@ -199,6 +205,7 @@
     debug_finetuning_bert100k_ntasks=debug_finetuning_bert100k_ntasks,
     finetuning_bert100k_glue=finetuning_bert100k_glue,
     finetuning_bert100k_single_task=finetuning_bert100k_single_task,
+    initial_finetuning_tiny_bert50k_glue=initial_finetuning_tiny_bert50k_glue,
     finetuning_bert700k_glue=finetuning_bert700k_glue,
     finetuning_bert700k_single_task=finetuning_bert700k_single_task,
     finetuning_bert1mi_glue=finetuning_bert1mi_glue,
diff --git a/projects/transformers/run.py b/projects/transformers/run.py
index 113b6f079..490673a86 100644
--- a/projects/transformers/run.py
+++ b/projects/transformers/run.py
@@ -52,7 +52,7 @@
 from transformers.integrations import is_wandb_available
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
 
-from experiments import CONFIGS
+from experiments import CONFIGS 
 from integrations import CustomWandbCallback
 from run_args import CustomTrainingArguments, DataTrainingArguments, ModelArguments
 from run_utils import (
@@ -299,9 +299,10 @@ def run_finetuning_single_task(
     eval_dataset = tokenized_datasets[
         "validation_matched" if data_args.task_name == "mnli" else "validation"
     ]
+    test_dataset = None
 
     if ((data_args.task_name is not None or data_args.test_file is not None)
-       and training_args.do_predict):
+            and training_args.do_predict):
         test_dataset = tokenized_datasets[
             "test_matched" if data_args.task_name == "mnli" else "test"
         ]
@@ -329,7 +330,9 @@ def run_finetuning_single_task(
         trainer_callbacks=model_args.trainer_callbacks or None,
         finetuning=True, task_name=data_args.task_name, is_regression=is_regression
     )
+
     if training_args.do_train:
+        logging.info(f"trainer.args.dataloader_drop_last before training: {trainer.args.dataloader_drop_last}")
         train(trainer, training_args.output_dir, last_checkpoint)
 
     # Evaluate
@@ -400,6 +403,7 @@ def run_finetuning_multiple_tasks(
         training_args.output_dir = os.path.join(
             base_training_args.output_dir, task_name
         )
+
         # Update any custom training hyperparameter
         # TODO: allow hyperparameter search for each task
         if task_name in model_args.task_hyperparams:
diff --git a/projects/transformers/run_utils.py b/projects/transformers/run_utils.py
index 8cafca338..162b3c22c 100644
--- a/projects/transformers/run_utils.py
+++ b/projects/transformers/run_utils.py
@@ -125,7 +125,22 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
     Returns evaluation dict with results.
     """
     eval_results = {}
+
+    # trainer references args.dataloader_drop_last for both train and eval dataloaders.
+    # We want to be able to drop last for training, but not for eval, because
+    # dropping at eval leaves off examples, leading to innacurate performance measures
+    # This is a hack to toggle args.dataloader_drop_last
+    drop_last=False
+    if trainer.args.dataloader_drop_last:
+        drop_last=True
+        trainer.args.dataloader_drop_last=False
+        logging.info("Switched trainer.args.dataloader_drop_last to False for evaluation")
+
     for eval_dataset, task in zip(eval_datasets, tasks):
+        vals, cnts = np.unique(np.array(eval_dataset['label']), return_counts=True)
+        logging.info(f"Label distribution for {task} before calling trainer.evaluate")
+        for val in range(len(vals)):
+            logging.info(f"Label={vals[val]}: {cnts[val]} examples")
         eval_result = trainer.evaluate(eval_dataset=eval_dataset)
         if task == "mnli-mm":
             eval_result = {f"mm_{k}": v for k, v in eval_result.items()}
@@ -141,6 +156,11 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
                logging.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")
 
+    # if you want drop_last for training, this toggles it back on
+    if drop_last:
+        trainer.args.dataloader_drop_last=True
+        logging.info("Switched trainer.args.dataloader_drop_last to back on for further training")
+
     return eval_results
 
 
@@ -708,7 +728,10 @@ def init_trainer(
     # Add specific metrics for finetuning task
     if finetuning:
         compute_metrics = partial(
-            compute_metrics_task, task_name=task_name, is_regression=is_regression,
+            compute_metrics_task,
+            task_name=task_name,
+            is_regression=is_regression,
+            output_dir=training_args.output_dir
         )
         trainer_kwargs.update(compute_metrics=compute_metrics)
 
@@ -718,7 +741,8 @@ def init_trainer(
 
 
 def compute_metrics_task(ep: EvalPrediction, metric=None,
-                         task_name=None, is_regression=None):
+                         task_name=None, is_regression=None,
+                         output_dir=None):
     """
     You can define your custom compute_metrics function. It takes an
     `EvalPrediction` object (a namedtuple with a predictions and label_ids
@@ -727,41 +751,53 @@ def compute_metrics_task(ep: EvalPrediction, metric=None,
     preds = (ep.predictions[0]
              if isinstance(ep.predictions, tuple) else ep.predictions)
     preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
 
-    if not is_regression:
-        logging.info(f"Label distribution for {task_name} before cleaning")
-        logging.info(f"Predictions: {Counter(preds).most_common()}")
-        logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")
+    assert -100 not in ep.label_ids, "unknown source of -100 labels"
+
+    # if not is_regression:
+    #     logging.info(f"Label distribution for {task_name} before cleaning")
+    #     logging.info(f"Predictions: {Counter(preds).most_common()}")
+    #     logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")
 
     # Ignore the -100 labels - not able to tokenize?
     # TODO: investigate why a few labels are -100 in all tasks
-    label_ids = ep.label_ids[np.where(ep.label_ids != -100)]
-    preds = preds[np.where(ep.label_ids != -100)]
-    logging.info(f"Removing {1-(len(label_ids) / len(ep.label_ids)):.2%} samples "
-                 "from evaluation set where label == -100")
+
+    # # all cases -100
+    # bad_label_idx = np.where(ep.label_ids == -100)[0]
+    # bad_label_ids = ep.label_ids[bad_label_idx]
+
+    # # everything else
+    # label_idx = np.where(ep.label_ids != -100)
+    # label_ids = ep.label_ids[label_idx]
+    # preds = preds[label_idx]
+    # logging.info(f"Removing {len(bad_label_idx) / len(ep.label_ids):.2%} samples "
+    #              "from evaluation set where label == -100")
+
+    # cache the -100 labels for further investigation
+    # save_unk_labels(bad_label_ids, bad_label_idx, output_dir, task_name)
 
     if not is_regression:
-        logging.info(f"Label distribution for {task_name} after cleaning")
+        logging.info(f"Label distribution for {task_name}")
         logging.info(f"Predictions: {Counter(preds).most_common()}")
-        logging.info(f"Labels: {Counter(label_ids).most_common()}")
+        logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")
 
     if task_name is not None:
         if task_name == "cola":
-            result = {"matthews_correlation": matthews_corrcoef(label_ids, preds)}
+            result = {"matthews_correlation": matthews_corrcoef(ep.label_ids, preds)}
         elif task_name == "stsb":
-            result = pearson_and_spearman(preds, label_ids)
+            result = pearson_and_spearman(preds, ep.label_ids)
         elif task_name in ["mrpc", "qqp"]:
-            result = acc_and_f1(preds, label_ids)
+            result = acc_and_f1(preds, ep.label_ids)
         elif task_name in ["sst2", "mnli", "mnli-mm", "mnli_mismatched",
                            "mnli_matched", "qnli", "rte", "wnli", "hans"]:
-            result = {"accuracy": simple_accuracy(preds, label_ids)}
+            result = {"accuracy": simple_accuracy(preds, ep.label_ids)}
         # Consolidate if more than one metric
         if len(result) > 1:
             result["combined_score"] = np.mean(list(result.values())).item()
         return result
     elif is_regression:
-        return {"mse": ((preds - label_ids) ** 2).mean().item()}
+        return {"mse": ((preds - ep.label_ids) ** 2).mean().item()}
     else:
-        return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}
+        return {"accuracy": (preds == ep.label_ids).astype(np.float32).mean().item()}
 
 
 def simple_accuracy(preds, labels):
@@ -916,7 +952,7 @@ def model_init():
         "HP search with saved models not supported."
 
     logging.info("Pretraining new model from scratch")
-    # Instantiate model; possibly one of our custom sparse models.
+    # Instantiate model; possibly one of our custom sparse models. 
 config_cls = CUSTOM_CONFIG_MAPPING[config.model_type]
 model_for_lm_cls = CUSTOM_MASKED_LM_MAPPING[config_cls]
 model = model_for_lm_cls(config)

From 685c30ed374d6f6ff5dca242a446d516805dd5db Mon Sep 17 00:00:00 2001
From: Ben Cohen
Date: Tue, 18 May 2021 16:05:47 -0400
Subject: [PATCH 2/7] remove comments

---
 projects/transformers/run_utils.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/projects/transformers/run_utils.py b/projects/transformers/run_utils.py
index 162b3c22c..fc546110b 100644
--- a/projects/transformers/run_utils.py
+++ b/projects/transformers/run_utils.py
@@ -753,28 +753,6 @@ def compute_metrics_task(ep: EvalPrediction, metric=None,
 
     assert -100 not in ep.label_ids, "unknown source of -100 labels"
 
-    # if not is_regression:
-    #     logging.info(f"Label distribution for {task_name} before cleaning")
-    #     logging.info(f"Predictions: {Counter(preds).most_common()}")
-    #     logging.info(f"Labels: {Counter(ep.label_ids).most_common()}")
-
-    # Ignore the -100 labels - not able to tokenize?
-    # TODO: investigate why a few labels are -100 in all tasks
-
-    # # all cases -100
-    # bad_label_idx = np.where(ep.label_ids == -100)[0]
-    # bad_label_ids = ep.label_ids[bad_label_idx]
-
-    # # everything else
-    # label_idx = np.where(ep.label_ids != -100)
-    # label_ids = ep.label_ids[label_idx]
-    # preds = preds[label_idx]
-    # logging.info(f"Removing {len(bad_label_idx) / len(ep.label_ids):.2%} samples "
-    #              "from evaluation set where label == -100")
-
-    # cache the -100 labels for further investigation
-    # save_unk_labels(bad_label_ids, bad_label_idx, output_dir, task_name)
-
     if not is_regression:
         logging.info(f"Label distribution for {task_name}")
         logging.info(f"Predictions: {Counter(preds).most_common()}")

From 10171b62e1013f43b98dad26d38afc7135dc1eec Mon Sep 17 00:00:00 2001
From: Ben Cohen
Date: Wed, 19 May 2021 19:50:34 -0400
Subject: [PATCH 3/7] small comment, see if CI will accept current user.email

---
 projects/transformers/run_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/projects/transformers/run_utils.py b/projects/transformers/run_utils.py
index fc546110b..6179ed1e9 100644
--- a/projects/transformers/run_utils.py
+++ b/projects/transformers/run_utils.py
@@ -751,6 +751,8 @@ def compute_metrics_task(ep: EvalPrediction, metric=None,
     preds = (ep.predictions[0]
              if isinstance(ep.predictions, tuple) else ep.predictions)
     preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
 
+    # -100 labels can come up when drop_last batch setting gets set to true during
+    # evaluation. That is fixed, so any -100 labels should not pass silently.
     assert -100 not in ep.label_ids, "unknown source of -100 labels"
 

From ecfca856fc7b3f625e334c530f0090bbb6b6466a Mon Sep 17 00:00:00 2001
From: Ben Cohen
Date: Wed, 19 May 2021 20:25:52 -0400
Subject: [PATCH 4/7] flake8 fixes, adjustments based on Michaelangelo's
 suggestions

---
 .../transformers/experiments/finetuning.py |  9 ++++----
 projects/transformers/run.py               | 15 +++++++------
 projects/transformers/run_utils.py         | 22 +++++++++----------
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/projects/transformers/experiments/finetuning.py b/projects/transformers/experiments/finetuning.py
index dd7d99791..4f4e0ab6d 100644
--- a/projects/transformers/experiments/finetuning.py
+++ b/projects/transformers/experiments/finetuning.py
@@ -144,9 +144,10 @@
 )
 
 
-initial_finetuning_tiny_bert50k_glue = deepcopy(finetuning_bert700k_glue)
-initial_finetuning_tiny_bert50k_glue.update(
-    model_name_or_path="/home/ec2-user/nta/results/experiments/transformers/tiny_bert_50k"
+finetuning_tiny_bert50k_glue = deepcopy(finetuning_bert700k_glue)
+finetuning_tiny_bert50k_glue.update(
+    model_name_or_path="/home/ec2-user"
+                       "/nta/results/experiments/transformers/tiny_bert_50k"
 )
 
 
@@ -205,7 +206,7 @@
     debug_finetuning_bert100k_ntasks=debug_finetuning_bert100k_ntasks,
     finetuning_bert100k_glue=finetuning_bert100k_glue,
     finetuning_bert100k_single_task=finetuning_bert100k_single_task,
-    initial_finetuning_tiny_bert50k_glue=initial_finetuning_tiny_bert50k_glue,
+    finetuning_tiny_bert50k_glue=finetuning_tiny_bert50k_glue,
     finetuning_bert700k_glue=finetuning_bert700k_glue,
     finetuning_bert700k_single_task=finetuning_bert700k_single_task,
     finetuning_bert1mi_glue=finetuning_bert1mi_glue,
diff --git a/projects/transformers/run.py b/projects/transformers/run.py
index 490673a86..c352bdf06 100644
--- a/projects/transformers/run.py
+++ b/projects/transformers/run.py
@@ -52,7 +52,7 @@
 from transformers.integrations import is_wandb_available
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
 
-from experiments import CONFIGS 
+from experiments import CONFIGS
 from integrations import CustomWandbCallback
 from run_args import CustomTrainingArguments, DataTrainingArguments, ModelArguments
 from run_utils import (
@@ -301,11 +301,11 @@ def run_finetuning_single_task(
     ]
     test_dataset = None
 
-    if ((data_args.task_name is not None or data_args.test_file is not None)
-            and training_args.do_predict):
-        test_dataset = tokenized_datasets[
-            "test_matched" if data_args.task_name == "mnli" else "test"
-        ]
+    if (data_args.task_name is not None or data_args.test_file is not None):
+        if training_args.do_predict:
+            test_dataset = tokenized_datasets[
+                "test_matched" if data_args.task_name == "mnli" else "test"
+            ]
 
     # Log fingerprint used in HF smart caching
     logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")
@@ -332,7 +332,8 @@ def run_finetuning_single_task(
     )
 
     if training_args.do_train:
-        logging.info(f"trainer.args.dataloader_drop_last before training: {trainer.args.dataloader_drop_last}")
+        logging.info("trainer.args.dataloader_drop_last before training:"
+                     "f{trainer.args.dataloader_drop_last}")
         train(trainer, training_args.output_dir, last_checkpoint)
 
     # Evaluate
diff --git a/projects/transformers/run_utils.py b/projects/transformers/run_utils.py
index 6179ed1e9..a23c74dcd 100644
--- a/projects/transformers/run_utils.py
+++ b/projects/transformers/run_utils.py
@@ -130,17 +130,14 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
     # We want to be able to drop last for training, but not for eval, because
     # dropping at eval leaves off examples, leading to innacurate performance measures
     # This is a hack to toggle args.dataloader_drop_last
-    drop_last=False
+    drop_last = False
     if trainer.args.dataloader_drop_last:
-        drop_last=True
-        trainer.args.dataloader_drop_last=False
-        logging.info("Switched trainer.args.dataloader_drop_last to False for evaluation")
+        drop_last = True
+        trainer.args.dataloader_drop_last = False
+        logging.info("Switched trainer.args.dataloader_drop_last"
+                     "to False for evaluation")
 
     for eval_dataset, task in zip(eval_datasets, tasks):
-        vals, cnts = np.unique(np.array(eval_dataset['label']), return_counts=True)
-        logging.info(f"Label distribution for {task} before calling trainer.evaluate")
-        for val in range(len(vals)):
-            logging.info(f"Label={vals[val]}: {cnts[val]} examples")
         eval_result = trainer.evaluate(eval_dataset=eval_dataset)
         if task == "mnli-mm":
             eval_result = {f"mm_{k}": v for k, v in eval_result.items()}
@@ -158,8 +155,9 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
                logging.info(f" {key} = {value}")
                writer.write(f"{key} = {value}\n")
 
     # if you want drop_last for training, this toggles it back on
     if drop_last:
-        trainer.args.dataloader_drop_last=True
-        logging.info("Switched trainer.args.dataloader_drop_last to back on for further training")
+        trainer.args.dataloader_drop_last = True
+        logging.info("Switched trainer.args.dataloader_drop_last"
+                     "to back on for further training")
 
     return eval_results
 
@@ -753,7 +751,7 @@ def compute_metrics_task(ep: EvalPrediction, metric=None,
 
     # -100 labels can come up when drop_last batch setting gets set to true during
     # evaluation. That is fixed, so any -100 labels should not pass silently.
-    assert -100 not in ep.label_ids, "unknown source of -100 labels"
+    assert -100 not in ep.label_ids, "Unknown source of -100 labels"
 
     if not is_regression:
         logging.info(f"Label distribution for {task_name}")
@@ -932,7 +930,7 @@ def model_init():
         "HP search with saved models not supported."
 
     logging.info("Pretraining new model from scratch")
-    # Instantiate model; possibly one of our custom sparse models. 
+    # Instantiate model; possibly one of our custom sparse models.
 config_cls = CUSTOM_CONFIG_MAPPING[config.model_type]
 model_for_lm_cls = CUSTOM_MASKED_LM_MAPPING[config_cls]
 model = model_for_lm_cls(config)

From 41f2d3a6e5b158c99ae4de7b409f83e6fb78520f Mon Sep 17 00:00:00 2001
From: Ben Cohen
Date: Thu, 20 May 2021 09:34:42 -0400
Subject: [PATCH 5/7] less verbose logging

---
 projects/transformers/run.py       | 2 --
 projects/transformers/run_utils.py | 4 ----
 2 files changed, 6 deletions(-)

diff --git a/projects/transformers/run.py b/projects/transformers/run.py
index c352bdf06..472e65f45 100644
--- a/projects/transformers/run.py
+++ b/projects/transformers/run.py
@@ -332,8 +332,6 @@ def run_finetuning_single_task(
     )
 
     if training_args.do_train:
-        logging.info("trainer.args.dataloader_drop_last before training:"
-                     "f{trainer.args.dataloader_drop_last}")
         train(trainer, training_args.output_dir, last_checkpoint)
 
     # Evaluate
diff --git a/projects/transformers/run_utils.py b/projects/transformers/run_utils.py
index a23c74dcd..e464d935f 100644
--- a/projects/transformers/run_utils.py
+++ b/projects/transformers/run_utils.py
@@ -134,8 +134,6 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
     if trainer.args.dataloader_drop_last:
         drop_last = True
         trainer.args.dataloader_drop_last = False
-        logging.info("Switched trainer.args.dataloader_drop_last"
-                     "to False for evaluation")
 
     for eval_dataset, task in zip(eval_datasets, tasks):
         eval_result = trainer.evaluate(eval_dataset=eval_dataset)
@@ -156,8 +154,6 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
     # if you want drop_last for training, this toggles it back on
     if drop_last:
         trainer.args.dataloader_drop_last = True
-        logging.info("Switched trainer.args.dataloader_drop_last"
-                     "to back on for further training")
 
     return eval_results
 

From d7a605a26775c3b3e1eee098ec5ec41b3343c59b Mon Sep 17 00:00:00 2001
From: Ben Cohen
Date: Mon, 24 May 2021 18:32:54 -0400
Subject: [PATCH 6/7] added a fix for every time trainer.evaluate or
 trainer.predict is called

---
 projects/transformers/run_utils.py | 42 ++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/projects/transformers/run_utils.py b/projects/transformers/run_utils.py
index e464d935f..415c87e95 100644
--- a/projects/transformers/run_utils.py
+++ b/projects/transformers/run_utils.py
@@ -118,23 +118,35 @@ def train(trainer, output_dir, last_checkpoint=None):
         *count_nonzero_params(trainer.model)
     ))
 
+def toggle_drop_last(trainer, should_drop_last):
+    """
+    Turn trainer.args.dataloader_drop_last on or off depending on use case
+    If drop_last is left on, then you can get skewed results anytime 
+    trainer.evaluate or trainer.predict is called, since drop_last will set
+    the last batch with incomplete number of samples to be labeled -100
+    You'll want to use this if you want drop_last on for training, but off
+    for testing
+
+    Example usage at evaluation time
+    drop_last = toggle_drop_last(trainer, False)
+    trainer.evaluate(...)
+    _ = toggle_drop_last(trainer, drop_last)
+    """
+
+    if should_drop_last:
+        return False
+    else:
+        trainer.args.dataloader_drop_last = False
+        return True
 
 def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
     """
     Evaluate tasks after finetuning.
 
     Returns evaluation dict with results.
     """
+    drop_last = toggle_drop_last(trainer, False) # should_drop_last=False
     eval_results = {}
 
-    # trainer references args.dataloader_drop_last for both train and eval dataloaders.
-    # We want to be able to drop last for training, but not for eval, because
-    # dropping at eval leaves off examples, leading to innacurate performance measures
-    # This is a hack to toggle args.dataloader_drop_last
-    drop_last = False
-    if trainer.args.dataloader_drop_last:
-        drop_last = True
-        trainer.args.dataloader_drop_last = False
-
     for eval_dataset, task in zip(eval_datasets, tasks):
         eval_result = trainer.evaluate(eval_dataset=eval_dataset)
         if task == "mnli-mm":
             eval_result = {f"mm_{k}": v for k, v in eval_result.items()}
@@ -152,8 +164,7 @@ def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
                writer.write(f"{key} = {value}\n")
 
     # if you want drop_last for training, this toggles it back on
-    if drop_last:
-        trainer.args.dataloader_drop_last = True
+    _ = toggle_drop_last(trainer, drop_last)
 
     return eval_results
 
@@ -162,6 +173,9 @@ def test_tasks(trainer, output_dir, tasks, test_datasets, is_regression, label_l
     """
     Test tasks after finetuning.
     """
+
+    drop_last = toggle_drop_last(trainer, False)
+
     for test_dataset, task in zip(test_datasets, tasks):
         # Removing the `label` columns because it contains -1
         # and Trainer won't like that.
@@ -184,9 +198,13 @@ def test_tasks(trainer, output_dir, tasks, test_datasets, is_regression, label_l
                     item = label_list[item]
                     writer.write(f"{index}\t{item}\n")
 
+    _ = toggle_drop_last(trainer, drop_last)
+
 
 def evaluate_language_model(trainer, eval_dataset, output_dir):
     """Evaluate language model. Returns dict with results on perplexity metric.
     """
+    drop_last = toggle_drop_last(trainer, False)
+
     results = {}
 
     eval_output = trainer.evaluate(eval_dataset)
@@ -201,6 +219,8 @@ def evaluate_language_model(trainer, eval_dataset, output_dir):
            logging.info(f" {key} = {value}")
            writer.write(f"{key} = {value}\n")
 
+    _ = toggle_drop_last(trainer, drop_last)
+
     return results
 

From 101a09738e7fb0de6d25f74bb9ea556d183a8a0b Mon Sep 17 00:00:00 2001
From: Ben Cohen
Date: Mon, 24 May 2021 18:44:18 -0400
Subject: [PATCH 7/7] fixed style

---
 projects/transformers/run_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/projects/transformers/run_utils.py b/projects/transformers/run_utils.py
index 415c87e95..67f49b224 100644
--- a/projects/transformers/run_utils.py
+++ b/projects/transformers/run_utils.py
@@ -118,33 +118,33 @@ def train(trainer, output_dir, last_checkpoint=None):
         *count_nonzero_params(trainer.model)
     ))
 
+
 def toggle_drop_last(trainer, should_drop_last):
     """
     Turn trainer.args.dataloader_drop_last on or off depending on use case
-    If drop_last is left on, then you can get skewed results anytime 
+    If drop_last is left on, then you can get skewed results anytime
     trainer.evaluate or trainer.predict is called, since drop_last will set
     the last batch with incomplete number of samples to be labeled -100
     You'll want to use this if you want drop_last on for training, but off
     for testing
-
     Example usage at evaluation time
     drop_last = toggle_drop_last(trainer, False)
     trainer.evaluate(...)
     _ = toggle_drop_last(trainer, drop_last)
     """
-
    if should_drop_last:
        return False
    else:
        trainer.args.dataloader_drop_last = False
        return True
 
+
 def evaluate_tasks(trainer, output_dir, tasks, eval_datasets):
     """
     Evaluate tasks after finetuning.
 
     Returns evaluation dict with results.
     """
-    drop_last = toggle_drop_last(trainer, False) # should_drop_last=False
+    drop_last = toggle_drop_last(trainer, False)  # should_drop_last=False
     eval_results = {}
 
     for eval_dataset, task in zip(eval_datasets, tasks):
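
A minimal standalone sketch of the idea this patch series converges on: force dataloader_drop_last off around evaluation so no eval samples get dropped (or labeled -100), then restore whatever the training run was using. This is not code from the patches — the context-manager shape, the eval_drop_last_disabled name, and the SimpleNamespace stand-in for a Hugging Face Trainer are illustrative assumptions; any object exposing args.dataloader_drop_last behaves the same way.

# Sketch only: same intent as toggle_drop_last() above, written as a context manager.
from contextlib import contextmanager
from types import SimpleNamespace


@contextmanager
def eval_drop_last_disabled(trainer):
    """Temporarily force dataloader_drop_last=False so evaluation sees every sample."""
    previous = trainer.args.dataloader_drop_last
    trainer.args.dataloader_drop_last = False
    try:
        yield trainer
    finally:
        # Restore the training-time setting even if evaluation raises.
        trainer.args.dataloader_drop_last = previous


# Stand-in trainer for demonstration; transformers.Trainer exposes the same attribute.
trainer = SimpleNamespace(args=SimpleNamespace(dataloader_drop_last=True))
with eval_drop_last_disabled(trainer):
    assert trainer.args.dataloader_drop_last is False   # call trainer.evaluate(...) here
assert trainer.args.dataloader_drop_last is True        # training setting restored

The try/finally restore is the main difference from flipping the attribute manually before and after each trainer.evaluate or trainer.predict call: the flag cannot be left in the wrong state if evaluation fails partway through.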