diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index f0f69f9e39b3..22a32e09025c 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -29,6 +29,7 @@
 from datasets import ClassLabel, load_dataset, load_metric
 
 import transformers
+from sparseml_utils import TokenClassificationModuleExporter
 from transformers import (
     AutoConfig,
     AutoModelForTokenClassification,
@@ -36,10 +37,10 @@
     DataCollatorForTokenClassification,
     HfArgumentParser,
     PreTrainedTokenizerFast,
-    Trainer,
     TrainingArguments,
     set_seed,
 )
+from transformers.sparse import SparseMLTrainer, export_model, load_recipe, preprocess_state_dict
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version
 
@@ -59,6 +60,9 @@ class ModelArguments:
     model_name_or_path: str = field(
         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
     )
+    distill_teacher: Optional[str] = field(
+        default=None, metadata={"help": "Teacher model, which needs to be a trained token classification model"}
+    )
     config_name: Optional[str] = field(
         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
     )
@@ -88,6 +92,19 @@ class DataTrainingArguments:
     """
     Arguments pertaining to what data we are going to input our model for training and eval.
     """
 
+    recipe: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Path to a SparseML sparsification recipe, see https://github.com/neuralmagic/sparseml "
+            "for more information"
+        },
+    )
+    onnx_export_path: Optional[str] = field(
+        default=None, metadata={"help": "Filename and path where the ONNX model will be written"}
+    )
+    num_exported_samples: Optional[int] = field(
+        default=20, metadata={"help": "Number of samples to export, defaults to 20"}
+    )
     task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
     dataset_name: Optional[str] = field(
         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
     )
@@ -278,6 +295,12 @@ def get_label_list(labels):
     # Distributed training:
     # The .from_pretrained methods guarantee that only one local process can concurrently
     # download model & vocab.
+
+    # Load and preprocess the state dict if a trained model already exists (in this case we continue
+    # to train or evaluate it). The preprocessing step restores the names of parameters changed by
+    # the QAT process.
+    state_dict = preprocess_state_dict(model_args.model_name_or_path)
+
     config = AutoConfig.from_pretrained(
         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
         num_labels=num_labels,
@@ -300,8 +323,20 @@ def get_label_list(labels):
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
+        state_dict=state_dict,
     )
 
+    teacher_model = None
+    if model_args.distill_teacher is not None:
+        teacher_model = AutoModelForTokenClassification.from_pretrained(
+            model_args.distill_teacher,
+            from_tf=bool(".ckpt" in model_args.distill_teacher),
+            cache_dir=model_args.cache_dir,
+        )
+        teacher_model_parameters = filter(lambda p: p.requires_grad, teacher_model.parameters())
+        params = sum([np.prod(p.size()) for p in teacher_model_parameters])
+        logger.info("Teacher model has %s parameters", params)
+
     # Tokenizer check: this script requires a fast tokenizer.
     if not isinstance(tokenizer, PreTrainedTokenizerFast):
         raise ValueError(
@@ -424,8 +459,15 @@ def compute_metrics(p):
             "accuracy": results["overall_accuracy"],
         }
 
+    # Load a possible existing recipe as well as the new one passed in through the command-line argument
+    existing_recipe = load_recipe(model_args.model_name_or_path)
+    new_recipe = data_args.recipe
+
     # Initialize our Trainer
-    trainer = Trainer(
+    trainer = SparseMLTrainer(
+        model_args.model_name_or_path,
+        [existing_recipe, new_recipe],
+        teacher=teacher_model,
         model=model,
         args=training_args,
         train_dataset=train_dataset if training_args.do_train else None,
@@ -435,6 +477,11 @@ def compute_metrics(p):
         compute_metrics=compute_metrics,
     )
 
+    # Apply recipes to the model. This is necessary because sparsification methods such as QAT
+    # modify the model graph with their own learnable parameters, which are also restored/loaded
+    # into the model here.
+    trainer.apply_recipes()
+
     # Training
     if training_args.do_train:
         checkpoint = None
@@ -502,6 +549,12 @@ def compute_metrics(p):
 
         trainer.push_to_hub(**kwargs)
 
+    if data_args.onnx_export_path:
+        logger.info("*** Export to ONNX ***")
+        eval_dataloader = trainer.get_eval_dataloader(eval_dataset)
+        exporter = TokenClassificationModuleExporter(model, output_dir=data_args.onnx_export_path)
+        export_model(exporter, eval_dataloader, data_args.onnx_export_path, data_args.num_exported_samples)
+
 
 def _mp_fn(index):
     # For xla_spawn (TPUs)
diff --git a/examples/pytorch/token-classification/sparseml_utils.py b/examples/pytorch/token-classification/sparseml_utils.py
new file mode 100644
index 000000000000..200c68d680a2
--- /dev/null
+++ b/examples/pytorch/token-classification/sparseml_utils.py
@@ -0,0 +1,21 @@
+from typing import Any
+
+import numpy
+
+from sparseml.pytorch.utils import ModuleExporter
+from transformers.modeling_outputs import TokenClassifierOutput
+
+
+class TokenClassificationModuleExporter(ModuleExporter):
+    """
+    Module exporter class for token classification
+    """
+
+    @classmethod
+    def get_output_names(cls, out: Any):
+        if not isinstance(out, TokenClassifierOutput):
+            raise ValueError(f"Expected TokenClassifierOutput, got {type(out)}")
+        expected = ["logits"]
+        if numpy.any([name for name in expected if name not in out]):
+            raise ValueError("Expected output names not found in model output")
+        return expected
diff --git a/src/transformers/sparse.py b/src/transformers/sparse.py
index 575aa515731b..386bc93de764 100644
--- a/src/transformers/sparse.py
+++ b/src/transformers/sparse.py
@@ -125,6 +125,28 @@ def qat_active(self, epoch: int):
 
         return qat_start < epoch + 1
 
+    def compute_loss(self, model, inputs, return_outputs=False):
+        """
+        Compute loss using teacher/student distillation
+        """
+        if not self.recipes or self.teacher is None:
+            return super().compute_loss(model, inputs, return_outputs=return_outputs)
+        student_outputs = model(**inputs)
+        loss = student_outputs["loss"]
+
+        steps_in_epoch = -1  # Unused
+        loss = self.manager.loss_update(
+            loss,
+            model,
+            self.optimizer,
+            self.state.epoch,
+            steps_in_epoch,
+            global_step=self.state.global_step,
+            student_outputs=student_outputs,
+            teacher_inputs=inputs,
+        )
+        return (loss, student_outputs) if return_outputs else loss
+
     def save_model(self, output_dir: Optional[str] = None):
         """
         Save model during or after training. Modifiers that change the model architecture will also be saved.
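
For reference, the new options could be exercised together roughly as follows; the dataset, recipe path, teacher checkpoint, and output directories below are illustrative placeholders rather than part of the diff:

    python run_ner.py \
      --model_name_or_path bert-base-uncased \
      --distill_teacher path/to/trained_ner_teacher \
      --dataset_name conll2003 \
      --recipe path/to/sparsification_recipe.yaml \
      --do_train \
      --do_eval \
      --output_dir ./sparse_ner_model \
      --onnx_export_path ./sparse_ner_model/onnx \
      --num_exported_samples 20

With such an invocation, SparseMLTrainer applies the recipe passed on the command line together with any recipe already stored alongside the checkpoint, distills from the teacher when --distill_teacher is given, and, if --onnx_export_path is set, exports the final model to ONNX via TokenClassificationModuleExporter after training and evaluation.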