diff --git a/integrations/huggingface-transformers/recipes/bert-base-12layers_prune80.md b/integrations/huggingface-transformers/recipes/bert-base-12layers_prune80.md index caf834a9d24..b6802a2d97c 100644 --- a/integrations/huggingface-transformers/recipes/bert-base-12layers_prune80.md +++ b/integrations/huggingface-transformers/recipes/bert-base-12layers_prune80.md @@ -15,10 +15,10 @@ limitations under the License. --> --- -# General variables +# General Variables num_epochs: &num_epochs 30 -# pruning hyperparameters +# Pruning Hyperparameters init_sparsity: &init_sparsity 0.00 final_sparsity: &final_sparsity 0.80 pruning_start_epoch: &pruning_start_epoch 2 @@ -26,7 +26,7 @@ pruning_end_epoch: &pruning_end_epoch 20 update_frequency: &pruning_update_frequency 0.01 -# modifiers: +# Modifiers training_modifiers: - !EpochRangeModifier end_epoch: 30 @@ -35,12 +35,12 @@ training_modifiers: pruning_modifiers: - !GMPruningModifier params: - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.self.query.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.self.key.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.self.value.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.output.dense.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).intermediate.dense.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).output.dense.weight + - re:bert.encoder.layer.*.attention.self.query.weight + - re:bert.encoder.layer.*.attention.self.key.weight + - re:bert.encoder.layer.*.attention.self.value.weight + - re:bert.encoder.layer.*.attention.output.dense.weight + - re:bert.encoder.layer.*.intermediate.dense.weight + - re:bert.encoder.layer.*.output.dense.weight start_epoch: *pruning_start_epoch end_epoch: *pruning_end_epoch init_sparsity: *init_sparsity @@ -52,21 +52,21 @@ pruning_modifiers: log_types: __ALL__ --- -# Bert model with pruned encoder layers +# BERT Model with Pruned Encoder Layers -This recipe defines a pruning strategy to sparsify all encoder layers of a Bert model at 80% sparsity. It was used together with knowledge distillation to create sparse model that achives 100% recovery from its baseline accuracy on the Squad dataset. -Training was done using 1 GPU at half precision using a training batch size of 16 with the +This recipe defines a pruning strategy to sparsify all encoder layers of a BERT model at 80% sparsity. It was used together with knowledge distillation to create a sparse model that completely recovers the F1 metric (88.596) of the baseline model on the SQuAD dataset. (We use the checkpoint at the end of the first 2 epochs as the baseline model for comparison, right before the pruning takes effect.) +Training was done using one V100 GPU at half precision using a training batch size of 16 with the [SparseML integration with huggingface/transformers](https://github.com/neuralmagic/sparseml/tree/main/integrations/huggingface-transformers). ## Weights and Biases -- [Sparse Bert on Squad](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/18qdx7b3?workspace=user-neuralmagic) +- [Sparse BERT on SQuAD](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/18qdx7b3?workspace=user-neuralmagic) ## Training To set up the training environment, follow the instructions on the [integration README](https://github.com/neuralmagic/sparseml/blob/main/integrations/huggingface-transformers/README.md). Using the `run_qa.py` script from the question-answering examples, the following command can be used to launch this recipe with distillation.
-Adjust the training command below with your setup for GPU device, checkpoint saving frequency and logging options. +Adjust the training command below with your setup for GPU device, checkpoint saving frequency, and logging options. *training command* ``` @@ -91,7 +91,7 @@ python transformers/examples/pytorch/question-answering/run_qa.py \ --distill_temperature 2.0 \ --save_steps 1000 \ --save_total_limit 2 \ - --recipe ../recipes/uni_80sparse_freq0.01_18prune10fine.md \ + --recipe ../recipes/bert-base-12layers_prune80.md \ --onnx_export_path MODELS_DIR/sparse80/onnx \ --report_to wandb ``` diff --git a/integrations/huggingface-transformers/recipes/bert-base-12layers_prune90.md b/integrations/huggingface-transformers/recipes/bert-base-12layers_prune90.md index 0518ee0dcec..778ec08020a 100644 --- a/integrations/huggingface-transformers/recipes/bert-base-12layers_prune90.md +++ b/integrations/huggingface-transformers/recipes/bert-base-12layers_prune90.md @@ -15,17 +15,17 @@ limitations under the License. --> --- -# General variables +# General Variables num_epochs: &num_epochs 30 -# pruning hyperparameters +# Pruning Hyperparameters init_sparsity: &init_sparsity 0.00 final_sparsity: &final_sparsity 0.90 pruning_start_epoch: &pruning_start_epoch 2 pruning_end_epoch: &pruning_end_epoch 20 update_frequency: &pruning_update_frequency 0.01 -# modifiers: +# Modifiers training_modifiers: - !EpochRangeModifier end_epoch: 30 @@ -34,12 +34,12 @@ training_modifiers: pruning_modifiers: - !GMPruningModifier params: - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.self.query.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.self.key.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.self.value.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.output.dense.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).intermediate.dense.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).output.dense.weight + - re:bert.encoder.layer.*.attention.self.query.weight + - re:bert.encoder.layer.*.attention.self.key.weight + - re:bert.encoder.layer.*.attention.self.value.weight + - re:bert.encoder.layer.*.attention.output.dense.weight + - re:bert.encoder.layer.*.intermediate.dense.weight + - re:bert.encoder.layer.*.output.dense.weight start_epoch: *pruning_start_epoch end_epoch: *pruning_end_epoch init_sparsity: *init_sparsity @@ -50,21 +50,21 @@ pruning_modifiers: mask_type: unstructured log_types: __ALL__ --- -# Bert model with pruned encoder layers +# BERT Model with Pruned Encoder Layers -This recipe defines a pruning strategy to sparsify all encoder layers of a Bert model at 90% sparsity. It was used together with knowledge distillation to create sparse model that achives 98.4% recovery from its baseline accuracy on the Squad dataset. -Training was done using 1 GPU at half precision using a training batch size of 16 with the +This recipe defines a pruning strategy to sparsify all encoder layers of a BERT model at 90% sparsity. It was used together with knowledge distillation to create a sparse model that achieves 98.4% recovery from the F1 metric (88.596) of the baseline model on the SQuAD dataset. (We use the checkpoint at the end of the first 2 epochs as the baseline model for comparison, right before the pruning takes effect.) +Training was done using one V100 GPU at half precision using a training batch size of 16 with the [SparseML integration with huggingface/transformers](https://github.com/neuralmagic/sparseml/tree/main/integrations/huggingface-transformers).
## Weights and Biases -- [Sparse Bert on Squad](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/2ht2eqsn?workspace=user-neuralmagic) +- [Sparse BERT on SQuAD](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/2ht2eqsn?workspace=user-neuralmagic) ## Training To set up the training environment, follow the instructions on the [integration README](https://github.com/neuralmagic/sparseml/blob/main/integrations/huggingface-transformers/README.md). Using the `run_qa.py` script from the question-answering examples, the following command can be used to launch this recipe with distillation. -Adjust the training command below with your setup for GPU device, checkpoint saving frequency and logging options. +Adjust the training command below with your setup for GPU device, checkpoint saving frequency, and logging options. *training command* ``` @@ -89,7 +89,7 @@ python transformers/examples/pytorch/question-answering/run_qa.py \ --distill_temperature 2.0 \ --save_steps 1000 \ --save_total_limit 2 \ - --recipe ../recipes/uni_90sparse_freq0.01_18prune10fine.md \ + --recipe ../recipes/bert-base-12layers_prune90.md \ --onnx_export_path MODELS_DIR/sparse90/onnx \ --report_to wandb ``` diff --git a/integrations/huggingface-transformers/recipes/bert-base-12layers_prune95.md b/integrations/huggingface-transformers/recipes/bert-base-12layers_prune95.md index 65fd9a7c121..b3b4e5e6516 100644 --- a/integrations/huggingface-transformers/recipes/bert-base-12layers_prune95.md +++ b/integrations/huggingface-transformers/recipes/bert-base-12layers_prune95.md @@ -15,17 +15,17 @@ limitations under the License. --> --- -# General variables +# General Variables num_epochs: &num_epochs 30 -# pruning hyperparameters +# Pruning Hyperparameters init_sparsity: &init_sparsity 0.00 final_sparsity: &final_sparsity 0.95 pruning_start_epoch: &pruning_start_epoch 2 pruning_end_epoch: &pruning_end_epoch 20 update_frequency: &pruning_update_frequency 0.01 -# modifiers: +# Modifiers training_modifiers: - !EpochRangeModifier end_epoch: 30 @@ -34,12 +34,12 @@ training_modifiers: pruning_modifiers: - !GMPruningModifier params: - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.self.query.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.self.key.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.self.value.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).attention.output.dense.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).intermediate.dense.weight - - re:bert.encoder.layer.([0,2,4,6,8]|11).output.dense.weight + - re:bert.encoder.layer.*.attention.self.query.weight + - re:bert.encoder.layer.*.attention.self.key.weight + - re:bert.encoder.layer.*.attention.self.value.weight + - re:bert.encoder.layer.*.attention.output.dense.weight + - re:bert.encoder.layer.*.intermediate.dense.weight + - re:bert.encoder.layer.*.output.dense.weight start_epoch: *pruning_start_epoch end_epoch: *pruning_end_epoch init_sparsity: *init_sparsity @@ -51,21 +51,21 @@ pruning_modifiers: log_types: __ALL__ --- -# Bert model with pruned encoder layers +# BERT Model with Pruned Encoder Layers -This recipe defines a pruning strategy to sparsify all encoder layers of a Bert model at 95% sparsity. It was used together with knowledge distillation to create sparse model that achives 94.7% recovery from its baseline accuracy on the Squad dataset. +This recipe defines a pruning strategy to sparsify all encoder layers of a BERT model at 95% sparsity. 
It was used together with knowledge distillation to create a sparse model that achieves 94.7% recovery from the F1 metric of the baseline model on the SQuAD dataset. (We use the checkpoint at the end of the first 2 epochs as the baseline model for comparison, right before the pruning takes effect.) Training was done using 1 GPU at half precision using a training batch size of 16 with the [SparseML integration with huggingface/transformers](https://github.com/neuralmagic/sparseml/tree/main/integrations/huggingface-transformers). ## Weights and Biases -- [Sparse Bert on Squad](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/3gv0arxd?workspace=user-neuralmagic) +- [Sparse BERT on SQuAD](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/3gv0arxd?workspace=user-neuralmagic) ## Training To set up the training environment, follow the instructions on the [integration README](https://github.com/neuralmagic/sparseml/blob/main/integrations/huggingface-transformers/README.md). Using the `run_qa.py` script from the question-answering examples, the following command can be used to launch this recipe with distillation. -Adjust the training command below with your setup for GPU device, checkpoint saving frequency and logging options. +Adjust the training command below with your setup for GPU device, checkpoint saving frequency, and logging options. *training command* ``` @@ -90,7 +90,7 @@ python transformers/examples/pytorch/question-answering/run_qa.py \ --distill_temperature 2.0 \ --save_steps 1000 \ --save_total_limit 2 \ - --recipe ../recipes/uni_95sparse_freq0.01_18prune10fine.md \ + --recipe ../recipes/bert-base-12layers_prune95.md \ --onnx_export_path MODELS_DIR/sparse95/onnx \ --report_to wandb ``` diff --git a/integrations/huggingface-transformers/recipes/bert-base-6layers_prune80.md b/integrations/huggingface-transformers/recipes/bert-base-6layers_prune80.md index 7e81ed98059..235574b81c4 100644 --- a/integrations/huggingface-transformers/recipes/bert-base-6layers_prune80.md +++ b/integrations/huggingface-transformers/recipes/bert-base-6layers_prune80.md @@ -15,17 +15,17 @@ limitations under the License. --> --- -# General variables +# General Variables num_epochs: &num_epochs 30 -# pruning hyperparameters +# Pruning Hyperparameters init_sparsity: &init_sparsity 0.00 final_sparsity: &final_sparsity 0.80 pruning_start_epoch: &pruning_start_epoch 2 pruning_end_epoch: &pruning_end_epoch 20 update_frequency: &pruning_update_frequency 0.01 -# modifiers: +# Modifiers training_modifiers: - !EpochRangeModifier end_epoch: 30 @@ -60,21 +60,21 @@ pruning_modifiers: - bert.encoder.layer.10 --- -# Bert model with dropped and pruned encoder layers +# BERT Model with Dropped and Pruned Encoder Layers -This recipe defines a dropping and pruning strategy to sparsify 6 encoder layers of a Bert model at 80% sparsity. It was used together with knowledge distillation to create sparse model that achives 97% recovery from its (teacher) baseline accuracy on the Squad dataset. -Training was done using 1 GPU at half precision using a training batch size of 16 with the +This recipe defines a dropping and pruning strategy to sparsify six encoder layers of a BERT model at 80% sparsity. It was used together with knowledge distillation to create a sparse model that exceeds the F1 metric (83.632) of the baseline model by 0.02% on the SQuAD dataset. (We use the checkpoint at the end of the first 2 epochs as the baseline model for comparison, right before the pruning takes effect.)
+Training was done using one V100 GPU at half precision using a training batch size of 16 with the [SparseML integration with huggingface/transformers](https://github.com/neuralmagic/sparseml/tree/main/integrations/huggingface-transformers). ## Weights and Biases -- [Sparse Bert on Squad](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/ebab4np4?workspace=user-neuralmagic) +- [Sparse BERT on SQuAD](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/ebab4np4?workspace=user-neuralmagic) ## Training To set up the training environment, follow the instructions on the [integration README](https://github.com/neuralmagic/sparseml/blob/main/integrations/huggingface-transformers/README.md). Using the `run_qa.py` script from the question-answering examples, the following command can be used to launch this recipe with distillation. -Adjust the training command below with your setup for GPU device, checkpoint saving frequency and logging options. +Adjust the training command below with your setup for GPU device, checkpoint saving frequency, and logging options. *training command* ``` @@ -99,7 +99,7 @@ python transformers/examples/pytorch/question-answering/run_qa.py \ --distill_temperature 2.0 \ --save_steps 1000 \ --save_total_limit 2 \ - --recipe ../recipes/uni_80sparse_freq0.01_18prune10fine_6layers.md \ + --recipe ../recipes/bert-base-6layers_prune80.md \ --onnx_export_path MODELS_DIR/sparse80_6layers/onnx \ --report_to wandb ``` diff --git a/integrations/huggingface-transformers/recipes/bert-base-6layers_prune90.md b/integrations/huggingface-transformers/recipes/bert-base-6layers_prune90.md index 12ace7a668c..5a1bda3c294 100644 --- a/integrations/huggingface-transformers/recipes/bert-base-6layers_prune90.md +++ b/integrations/huggingface-transformers/recipes/bert-base-6layers_prune90.md @@ -15,17 +15,17 @@ limitations under the License. --> --- -# General Epoch/LR variables +# General Variables num_epochs: &num_epochs 30 -# pruning hyperparameters +# Pruning Hyperparameters init_sparsity: &init_sparsity 0.00 final_sparsity: &final_sparsity 0.90 pruning_start_epoch: &pruning_start_epoch 2 pruning_end_epoch: &pruning_end_epoch 20 update_frequency: &pruning_update_frequency 0.01 -# modifiers: +# Modifiers training_modifiers: - !EpochRangeModifier end_epoch: 30 @@ -60,21 +60,21 @@ pruning_modifiers: - bert.encoder.layer.10 --- -# Bert model with dropped and pruned encoder layers +# BERT Model with Dropped and Pruned Encoder Layers -This recipe defines a dropping and pruning strategy to sparsify 6 encoder layers of a Bert model at 90% sparsity. It was used together with knowledge distillation to create sparse model that achives 94.5% recovery from its (teacher) baseline accuracy on the Squad dataset. -Training was done using 1 GPU at half precision using a training batch size of 16 with the +This recipe defines a dropping and pruning strategy to sparsify six encoder layers of a BERT model at 90% sparsity. It was used together with knowledge distillation to create a sparse model that achieves 99.9% recovery from the F1 metric (83.632) of the baseline model on the SQuAD dataset. (We use the checkpoint at the end of the first 2 epochs as the baseline model for comparison, right before the pruning takes effect.) +Training was done using one V100 GPU at half precision using a training batch size of 16 with the [SparseML integration with huggingface/transformers](https://github.com/neuralmagic/sparseml/tree/main/integrations/huggingface-transformers).
## Weights and Biases -- [Sparse Bert on Squad](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/3qvxoroz?workspace=user-neuralmagic) +- [Sparse BERT on SQuAD](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/3qvxoroz?workspace=user-neuralmagic) ## Training To set up the training environment, follow the instructions on the [integration README](https://github.com/neuralmagic/sparseml/blob/main/integrations/huggingface-transformers/README.md). Using the `run_qa.py` script from the question-answering examples, the following command can be used to launch this recipe with distillation. -Adjust the training command below with your setup for GPU device, checkpoint saving frequency and logging options. +Adjust the training command below with your setup for GPU device, checkpoint saving frequency, and logging options. *training command* ``` @@ -99,7 +99,7 @@ python transformers/examples/pytorch/question-answering/run_qa.py \ --distill_temperature 2.0 \ --save_steps 1000 \ --save_total_limit 2 \ - --recipe ../recipes/uni_90sparse_freq0.01_18prune10fine_6layers.md \ + --recipe ../recipes/bert-base-6layers_prune90.md \ --onnx_export_path MODELS_DIR/sparse90_6layers/onnx \ --report_to wandb ``` diff --git a/integrations/huggingface-transformers/recipes/bert-base-6layers_prune95.md b/integrations/huggingface-transformers/recipes/bert-base-6layers_prune95.md index a7291986147..c46d0d15001 100644 --- a/integrations/huggingface-transformers/recipes/bert-base-6layers_prune95.md +++ b/integrations/huggingface-transformers/recipes/bert-base-6layers_prune95.md @@ -15,17 +15,17 @@ limitations under the License. --> --- -# General Epoch/LR variables +# General Variables num_epochs: &num_epochs 30 -# pruning hyperparameters +# Pruning Hyperparameters init_sparsity: &init_sparsity 0.00 final_sparsity: &final_sparsity 0.95 pruning_start_epoch: &pruning_start_epoch 2 pruning_end_epoch: &pruning_end_epoch 20 update_frequency: &pruning_update_frequency 0.01 -# modifiers: +# Modifiers training_modifiers: - !EpochRangeModifier end_epoch: 30 @@ -60,21 +60,21 @@ pruning_modifiers: - bert.encoder.layer.10 --- -# Bert model with dropped and pruned encoder layers +# BERT Model with Dropped and Pruned Encoder Layers -This recipe defines a dropping and pruning strategy to sparsify 6 encoder layers of a Bert model at 95% sparsity. It was used together with knowledge distillation to create sparse model that achives 90% recovery from its (teacher) baseline accuracy on the Squad dataset. -Training was done using 1 GPU at half precision using a training batch size of 16 with the +This recipe defines a dropping and pruning strategy to sparsify six encoder layers of a BERT model at 95% sparsity. It was used together with knowledge distillation to create a sparse model that achieves 96.2% recovery from the F1 metric of the baseline model on the SQuAD dataset. (We use the checkpoint at the end of the first 2 epochs as the baseline model for comparison, right before the pruning takes effect.) +Training was done using one V100 GPU at half precision using a training batch size of 16 with the [SparseML integration with huggingface/transformers](https://github.com/neuralmagic/sparseml/tree/main/integrations/huggingface-transformers).
## Weights and Biases -- [Sparse Bert on Squad](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/3plynclw?workspace=user-neuralmagic) +- [Sparse BERT on SQuAD](https://wandb.ai/neuralmagic/sparse-bert-squad/runs/3plynclw?workspace=user-neuralmagic) ## Training To set up the training environment, follow the instructions on the [integration README](https://github.com/neuralmagic/sparseml/blob/main/integrations/huggingface-transformers/README.md). Using the `run_qa.py` script from the question-answering examples, the following command can be used to launch this recipe with distillation. -Adjust the training command below with your setup for GPU device, checkpoint saving frequency and logging options. +Adjust the training command below with your setup for GPU device, checkpoint saving frequency, and logging options. *training command* ``` @@ -99,7 +99,7 @@ python transformers/examples/pytorch/question-answering/run_qa.py \ --distill_temperature 2.0 \ --save_steps 1000 \ --save_total_limit 2 \ - --recipe ../recipes/uni_95sparse_freq0.01_18prune10fine_6layers.md \ + --recipe ../recipes/bert-base-6layers_prune95.md \ --onnx_export_path MODELS_DIR/sparse95_6layers/onnx \ --report_to wandb ``` diff --git a/integrations/huggingface-transformers/setup_integration.sh b/integrations/huggingface-transformers/setup_integration.sh index d3ad9d377fd..7a851c50893 100644 --- a/integrations/huggingface-transformers/setup_integration.sh +++ b/integrations/huggingface-transformers/setup_integration.sh @@ -4,6 +4,7 @@ # Creates a transformers folder next to this script with all required dependencies from the huggingface/transformers repository. # Command: `bash setup_integration.sh` -git clone https://github.com/huggingface/transformers.git +git clone https://github.com/neuralmagic/transformers.git cd transformers pip install -e . +pip install datasets diff --git a/integrations/huggingface-transformers/tutorials/images/bert_12_6_layers_EM.png b/integrations/huggingface-transformers/tutorials/images/bert_12_6_layers_EM.png new file mode 100644 index 00000000000..3fedade6fd0 Binary files /dev/null and b/integrations/huggingface-transformers/tutorials/images/bert_12_6_layers_EM.png differ diff --git a/integrations/huggingface-transformers/tutorials/images/bert_12_6_layers_F1.png b/integrations/huggingface-transformers/tutorials/images/bert_12_6_layers_F1.png new file mode 100644 index 00000000000..d4c45f18402 Binary files /dev/null and b/integrations/huggingface-transformers/tutorials/images/bert_12_6_layers_F1.png differ diff --git a/integrations/huggingface-transformers/tutorials/sparsifying_bert_using_recipes.md b/integrations/huggingface-transformers/tutorials/sparsifying_bert_using_recipes.md new file mode 100644 index 00000000000..1f07f4ab322 --- /dev/null +++ b/integrations/huggingface-transformers/tutorials/sparsifying_bert_using_recipes.md @@ -0,0 +1,155 @@ + + +# Sparsifying BERT Models Using Recipes + +This tutorial presents SparseML’s extension of the Hugging Face Transformers training workflow to support model sparsification, including knowledge distillation, parameter pruning, and layer dropping. The examples used in this tutorial are specifically for BERT base uncased models, trained and pruned on the SQuAD dataset; further support and results will be available for other datasets in the near future.
+ +## Overview +Neural Magic’s ML team creates recipes that allow anyone to plug in their data and leverage SparseML’s recipe-driven approach on top of Hugging Face’s robust training pipelines. Sparsifying involves removing redundant information from neural networks using algorithms such as pruning and quantization, among others. This sparsification process results in many benefits for deployment environments, including faster inference and smaller file sizes. Unfortunately, many have not realized the benefits due to the complicated process and number of hyperparameters involved. + +Working through this tutorial, you will experience how Neural Magic recipes simplify the sparsification process by: + +- Creating a pre-trained teacher model for knowledge distillation. + +- Applying a recipe to select the trade-off between the amount of recovery of the baseline training performance and the amount of sparsification for inference performance. + +- Exporting a pruned model to the ONNX format to run with an inference engine such as DeepSparse. + +All the results listed in this tutorial are publicly available through a [Weights and Biases project](https://wandb.ai/neuralmagic/sparse-bert-squad?workspace=user-neuralmagic). +
+
+
+
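The pruning recipes above select the parameters to prune with `re:` patterns such as `re:bert.encoder.layer.*.attention.self.query.weight`, which cover every encoder layer rather than a fixed subset. If you adapt a recipe to another model, a quick check of which parameters a pattern targets can catch mistakes before a long training run. The sketch below is illustrative only: it assumes the text after the `re:` prefix is matched against `named_parameters()` names with Python's `re` module, so consult the SparseML documentation for the authoritative matching behavior.

```
import re

from transformers import BertConfig, BertForQuestionAnswering

# Randomly initialized model: parameter names match the pretrained checkpoint,
# and nothing needs to be downloaded just to inspect them.
model = BertForQuestionAnswering(BertConfig())

# One of the patterns used by the recipes above (without the "re:" prefix).
pattern = r"bert.encoder.layer.*.attention.self.query.weight"

# Assumption: SparseML applies such patterns with Python's re module;
# re.match is used here only as an approximation for illustration.
matched = [name for name, _ in model.named_parameters() if re.match(pattern, name)]

print(len(matched), "parameters matched")  # one query weight per encoder layer
for name in matched[:3]:
    print(name)
```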