From 939e389b3c2f12e65c1e0467e21908080f8b9759 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 3 Jun 2021 01:56:04 -0400 Subject: [PATCH 1/7] Initial commit: QA and Distill QA with SparseML integ --- .../recipes/finetune_squad_2epochs.yaml | 6 + ..._80blocksparse_freq0.01_18prune10fine.yaml | 871 ++++++++++++++++++ .../question-answering/run_distill_qa.py | 791 ++++++++++++++++ examples/pytorch/question-answering/run_qa.py | 133 ++- .../question-answering/sparseml_utils.py | 195 ++++ 5 files changed, 1993 insertions(+), 3 deletions(-) create mode 100644 examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml create mode 100644 examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml create mode 100755 examples/pytorch/question-answering/run_distill_qa.py create mode 100644 examples/pytorch/question-answering/sparseml_utils.py diff --git a/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml b/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml new file mode 100644 index 000000000000..8529afa2f4f6 --- /dev/null +++ b/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml @@ -0,0 +1,6 @@ +version: 1.1.0 + +modifiers: + - !EpochRangeModifier + end_epoch: 2 + start_epoch: 0.0 \ No newline at end of file diff --git a/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml b/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml new file mode 100644 index 000000000000..1673f5441e49 --- /dev/null +++ b/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml @@ -0,0 +1,871 @@ +version: 1.1.0 + +modifiers: + - !EpochRangeModifier + end_epoch: 30 + start_epoch: 0.0 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + 
log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + 
mask_type: [1,4] + params: ['bert.encoder.layer.3.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: 
['bert.encoder.layer.5.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: 
['bert.encoder.layer.7.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: 
['bert.encoder.layer.9.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: 
['bert.encoder.layer.11.output.dense.weight']
+    start_epoch: 2
+    update_frequency: 0.01
+
diff --git a/examples/pytorch/question-answering/run_distill_qa.py b/examples/pytorch/question-answering/run_distill_qa.py
new file mode 100755
index 000000000000..e1009ee67ec4
--- /dev/null
+++ b/examples/pytorch/question-answering/run_distill_qa.py
@@ -0,0 +1,791 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Example script for integrating SparseML with the transformers library to perform model distillation.
+This script is adapted from Hugging Face's implementation for Question Answering on the SQuAD dataset.
+Hugging Face's original implementation is regularly updated and can be found at https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py
+This script will:
+- Load transformer-based models
+- Load a SparseML training and pruning optimizer
+- Train on SQuAD
+- Evaluate on SQuAD
+- Export the model to ONNX.
+##########
+Command help:
+usage: run_distill_qa.py [-h] \
+    [--teacher_model_name_or_path] \
+    [--student_model_name_or_path] \
+    [--temperature] \
+    [--distill_hardness] \
+    [--dataset_name] \
+    [--num_train_epochs] \
+    [--do_train] \
+    [--do_eval] \
+    [--per_device_train_batch_size] \
+    [--per_device_eval_batch_size] \
+    [--learning_rate] \
+    [--max_seq_length] \
+    [--doc_stride] \
+    [--output_dir] \
+    [--overwrite_output_dir] \
+    [--cache_dir] \
+    [--preprocessing_num_workers] \
+    [--seed] \
+    [--nm_prune_config] \
+    [--do_onnx_export] \
+    [--onnx_export_path] \
+    [--layers_to_keep]
+
+Train, prune, and evaluate a transformer-based question answering model on SQuAD.
+  -h, --help  show this help message and exit
+  --teacher_model_name_or_path  The name or path of the model used for distillation.
+      Note: this model must already be trained for the QA task.
+  --student_model_name_or_path  The path to the transformers model you wish to train
+      or the name of the pretrained language model you wish to use. ex: bert-base-uncased.
+  --temperature  Hyperparameter (softmax temperature) which controls model distillation.
+  --distill_hardness  Hyperparameter which controls how much of the loss comes from the teacher vs the training labels.
+  --dataset_name  The name of the dataset to train your model on. ex: squad for SQuAD.
+  --num_train_epochs  Parameter to control how many training epochs the model should run.
+  --do_train  Boolean denoting if the model should be trained or not. Default is false.
+  --do_eval  Boolean denoting if the model should be evaluated or not. Default is false.
+  --per_device_train_batch_size  Size of each training batch based on samples per GPU.
+      12 will fit in an 11GB GPU, 16 in a 16GB GPU.
+  --per_device_eval_batch_size  Size of each evaluation batch based on samples per GPU.
+      12 will fit in an 11GB GPU, 16 in a 16GB GPU.
+  --learning_rate  Initial learning rate as a float value. ex: 3e-5.
+  --max_seq_length  Int for the max sequence length to be parsed as a context window. ex: 384 tokens.
+  --doc_stride  Stride, in tokens, taken between chunks when splitting up a long document.
+  --output_dir  Path where model checkpoints and outputs should be saved.
+  --overwrite_output_dir  Boolean denoting if an existing output directory should be overwritten.
+  --cache_dir  Directory where cached transformer files (datasets, models, tokenizers) are saved for fast loading.
+  --preprocessing_num_workers  The number of CPU workers used to preprocess the datasets.
+  --seed  Int which determines the random seed used for training/shuffling.
+  --nm_prune_config  Path to the Neural Magic pruning configuration file. Examples can
+      be found in prune_config_files but are customized for bert-base-uncased.
+  --do_onnx_export  Boolean denoting if the model should be exported to ONNX.
+  --onnx_export_path  Path where the ONNX model will be exported. ex: onnx-export.
+  --layers_to_keep  Number of layers to keep from the original model. Layers are dropped before training.
+
+##########
+Example command for training a 95% sparse BERT SQuAD model for 1 epoch with an unpruned teacher:
+python run_distill_qa.py \
+    --teacher_model_name_or_path models/neuralmagic-bert-squad-12layer-0sparse \
+    --student_model_name_or_path bert-base-uncased \
+    --dataset_name squad \
+    --num_train_epochs 1 \
+    --do_train \
+    --do_eval \
+    --per_device_train_batch_size 12 \
+    --per_device_eval_batch_size 12 \
+    --learning_rate 3e-5 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir 95sparsity1epoch/ \
+    --overwrite_output_dir \
+    --cache_dir cache \
+    --preprocessing_num_workers 8 \
+    --seed 42 \
+    --nm_prune_config prune_config_files/95sparsity1epoch.yaml \
+    --do_onnx_export \
+    --onnx_export_path 95sparsity1epoch/ \
+    --distill_hardness 0.5 \
+    --temperature 2.0
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import logging
+import os
+import numpy as np
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+from datasets import load_dataset, load_metric
+
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoModelForQuestionAnswering,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    EvalPrediction,
+    HfArgumentParser,
+    PreTrainedTokenizerFast,
+    TrainingArguments,
+    default_data_collator,
+    set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version
+from utils_qa import postprocess_qa_predictions
+
+# Start SparseML integration
+from sparseml_utils import SparseMLDistillQATrainer, convert_example_to_features
+from sparseml.pytorch.utils import ModuleExporter
+# End SparseML integration
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.7.0.dev0")
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """ + teacher_model_name_or_path: Optional[str] = field( + default="spacemanidol/neuralmagic-bert-squad-12layer-0sparse", metadata={"help": "Teacher model which needs to be a trained QA model"} + ) + student_model_name_or_path: Optional[str] = field( + default="bert-base-uncased", metadata={"help": "Student model"} + ) + temperature: Optional[float] = field( + default=2.0, metadata={"help": "Temperature applied to teacher softmax for distillation."} + ) + distill_hardness: Optional[float] = field( + default=1.0, metadata={"help": "Proportion of loss coming from teacher model."} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + #################################################################################### + # Start SparseML Integration + #################################################################################### + nm_prune_config: Optional[str] = field( + default="recipes/noprune1epoch.yaml", + metadata={"help": "The input file name for the Neural Magic pruning config"}, + ) + do_onnx_export: bool = field(default=True, metadata={"help": "Export model to onnx"}) + onnx_export_path: Optional[str] = field( + default="onnx-export", metadata={"help": "The filename and path which will be where onnx model is outputed"} + ) + #################################################################################### + # End SparseML Integration + #################################################################################### + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + test_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_seq_length: int = field( + default=384, + metadata={ + "help": "The maximum total input sequence length after tokenization. 
Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " + "be faster on GPU but will be slower on TPU)." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + }, + ) + version_2_with_negative: bool = field( + default=False, metadata={"help": "If true, some of the examples do not have an answer."} + ) + null_score_diff_threshold: float = field( + default=0.0, + metadata={ + "help": "The threshold used to select the null answer: if the best answer has a score that is less than " + "the score of the null answer minus this threshold, the null answer is selected for this example. " + "Only useful when `version_2_with_negative=True`." + }, + ) + doc_stride: int = field( + default=128, + metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, + ) + n_best_size: int = field( + default=20, + metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, + ) + max_answer_length: int = field( + default=30, + metadata={ + "help": "The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another." + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + and self.test_file is None + ): + raise ValueError("Need either a dataset name or a training/validation file/test_file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Detecting last checkpoint. 
+ last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if training_args.should_log: + transformers.utils.logging.set_verbosity_info() + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + logger.info(f"Training/evaluation parameters {training_args}") + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.student_model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.student_model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + student_model = AutoModelForQuestionAnswering.from_pretrained( + model_args.student_model_name_or_path, + from_tf=bool(".ckpt" in model_args.student_model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + teacher_model = None + if model_args.teacher_model_name_or_path != None: + teacher_model = AutoModelForQuestionAnswering.from_pretrained( + model_args.teacher_model_name_or_path, + from_tf=bool(".ckpt" in model_args.teacher_model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + ) + teacher_model_parameters = filter(lambda p: p.requires_grad, teacher_model.parameters()) + params = sum([np.prod(p.size()) for p in teacher_model_parameters]) + logger.info("Teacher Model has %s parameters", params) + + student_model_parameters = filter(lambda p: p.requires_grad, student_model.parameters()) + params = sum([np.prod(p.size()) for p in student_model_parameters]) + logger.info("Student Model has %s parameters", params) + + # Tokenizer check: this script requires a fast tokenizer. + if not isinstance(tokenizer, PreTrainedTokenizerFast): + raise ValueError( + "This example script only works for models that have a fast tokenizer. Checkout the big table of models " + "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " + "requirement" + ) + + # Preprocessing the datasets. + # Preprocessing is slighlty different for training and evaluation. + if training_args.do_train: + column_names = datasets["train"].column_names + elif training_args.do_eval: + column_names = datasets["validation"].column_names + else: + column_names = datasets["test"].column_names + question_column_name = "question" if "question" in column_names else column_names[0] + context_column_name = "context" if "context" in column_names else column_names[1] + answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding side determines if we do (question|context) or (context|question). + pad_on_right = tokenizer.padding_side == "right" + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + # Training preprocessing + def prepare_train_features(examples): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
+ tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). 
+ while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + if training_args.do_train: + if "train" not in datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = datasets["train"] + if data_args.max_train_samples is not None: + # We will select sample from whole data if agument is specified + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + # Create train feature from dataset + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + if data_args.max_train_samples is not None: + # Number of samples might increase during Feature Creation, We select only specified max samples + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + # Validation preprocessing + def prepare_validation_features(examples): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + + if training_args.do_eval: + if "validation" not in datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = datasets["validation"] + if data_args.max_eval_samples is not None: + # We will select sample from whole data + eval_examples = eval_examples.select(range(data_args.max_eval_samples)) + # Validation Feature Creation + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + if data_args.max_eval_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + if training_args.do_predict: + if "test" not in datasets: + raise ValueError("--do_predict requires a test dataset") + predict_examples = datasets["test"] + if data_args.max_predict_samples is not None: + # We will select sample from whole data + predict_examples = predict_examples.select(range(data_args.max_predict_samples)) + # Predict Feature Creation + predict_dataset = predict_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + if data_args.max_predict_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) + + # Data collator + # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data + # collator. + data_collator = ( + default_data_collator + if data_args.pad_to_max_length + else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) + ) + + # Post-processing: + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. + predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=data_args.version_2_with_negative, + n_best_size=data_args.n_best_size, + max_answer_length=data_args.max_answer_length, + null_score_diff_threshold=data_args.null_score_diff_threshold, + output_dir=training_args.output_dir, + is_world_process_zero=trainer.is_world_process_zero(), + prefix=stage, + ) + # Format the result to the format the metric expects. 
+ if data_args.version_2_with_negative: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") + + def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + #################################################################################### + # Start SparseML Integration + #################################################################################### + # Initialize our Trainer + trainer = SparseMLDistillQATrainer( + data_args.nm_prune_config, + teacher=teacher_model, + distill_hardness = model_args.distill_hardness, + temperature = model_args.temperature, + model=student_model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + eval_examples=eval_examples if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + post_process_function=post_processing_function, + compute_metrics=compute_metrics, + ) + #################################################################################### + # End SparseML Integration + #################################################################################### + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Prediction + if training_args.do_predict: + logger.info("*** Predict ***") + results = trainer.predict(predict_dataset, predict_examples) + metrics = results.metrics + + max_predict_samples = ( + data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) + ) + metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) + + trainer.log_metrics("predict", metrics) + trainer.save_metrics("predict", metrics) + + if training_args.push_to_hub: + kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "question-answering"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = 
f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + trainer.push_to_hub(**kwargs) + + #################################################################################### + # Start SparseML Integration + #################################################################################### + if data_args.do_onnx_export: + logger.info("*** Export to ONNX ***") + os.environ["TOKENIZERS_PARALLELISM"] = "false" + exporter = ModuleExporter( + student_model, output_dir=data_args.onnx_export_path + ) + sample_batch = convert_example_to_features( + datasets["validation"][0], + tokenizer, + data_args.max_seq_length, + data_args.doc_stride + ) + exporter.export_onnx(sample_batch=sample_batch, convert_qat=True) + #################################################################################### + # End SparseML Integration + #################################################################################### + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 27155208be5f..009613b38a46 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -14,7 +14,89 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Fine-tuning the library models for question answering. +Example script for integrating spaseml with the transformers library. +This script is addopted from hugging face's implementation for Question Answering on the SQUAD Dataset. +Hugging Face's original implementation is regularly updated and can be found at https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py +This script will: +- Load transformer based modesl +- Load a sparseml training and pruning optimizer +- Train on SQUAD +- Evaluate on SQUAD +- Export model to onnx. +########## +Command help: +usage: run_qa.py [-h] \ + --model_name_or_path MODEL \ + [--dataset_name] \ + [--num_train_epochs] \ + [--do_train] \ + [--do_eval] \ + [--per_device_train_batch_size] \ + [--per_device_eval_batch_size] \ + [--learning_rate]\ + [--max_seq_length]\ + [--doc_stride]\ + [--output_dir] \ + [--overwrite_output_dir] \ + [--cache_dir]\ + [--preprocessing_num_workers] \ + [--seed] 42 \ + [--nm_prune_config] + [--do_onnx_export] + [--onnx_export_path] + +Train, prune, and evaluate a transformer base question answering model on squad. + -h, --help show this help message and exit + --model_name_or_path MODEL The path to the transformers model you wish to train + or the name of the pretrained language model you wish + to use. ex: bert-base-uncased. + --dataset_name The name of which dataset you want to use to train or + your model. ex: squad for using SQuAD. + --num_train_epochs Paramater to control how many training epochs you wish + your model to train. + --do_train Boolean denoting if the model should be trained + or not. Default is false. + --do_eval Boolean denoting if the model should be evaluated + or not. Default is false. + --per_device_train_batch_size Size of each training batch based on samples per GPU. + 12 will fit in a 11gb GPU, 16 in a 16gb. + --per_device_eval_batch_size Size of each training batch based on samples per GPU. + 12 will fit in a 11gb GPU, 16 in a 16gb. + --learning_rate Learning rate initial float value. ex: 3e-5. 
@@ -27,7 +109,6 @@
 from datasets import load_dataset, load_metric
 
 import transformers
-from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     AutoConfig,
     AutoModelForQuestionAnswering,
@@ -44,6 +125,11 @@
 from transformers.utils import check_min_version
 from utils_qa import postprocess_qa_predictions
 
+# Start SparseML integration
+from sparseml_utils import SparseMLQATrainer, convert_example_to_features
+from sparseml.pytorch.utils import ModuleExporter
+# End SparseML integration
+
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.7.0.dev0")
@@ -89,6 +175,21 @@ class DataTrainingArguments:
     """
     Arguments pertaining to what data we are going to input our model for training and eval.
     """
+    ####################################################################################
+    # Start SparseML Integration
+    ####################################################################################
+    nm_prune_config: Optional[str] = field(
+        default="recipes/noprune1epoch.yaml",
+        metadata={"help": "The input file name for the Neural Magic pruning config"},
+    )
+    do_onnx_export: bool = field(default=True, metadata={"help": "Export model to onnx"})
+    onnx_export_path: Optional[str] = field(
+        default="onnx-export", metadata={"help": "The filename and path where the ONNX model will be output"}
+    )
+    ####################################################################################
+    # End SparseML Integration
+    ####################################################################################
+
     dataset_name: Optional[str] = field(
         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
     )
@@ -542,8 +643,12 @@ def post_processing_function(examples, features, predictions, stage="eval"):
     def compute_metrics(p: EvalPrediction):
         return metric.compute(predictions=p.predictions, references=p.label_ids)
 
+    ####################################################################################
+    # Start SparseML Integration
+    ####################################################################################
     # Initialize our Trainer
-    trainer = QuestionAnsweringTrainer(
+    trainer = SparseMLQATrainer(
+        data_args.nm_prune_config,
         model=model,
         args=training_args,
         train_dataset=train_dataset if training_args.do_train else None,
@@ -554,6 +659,9 @@ def compute_metrics(p: EvalPrediction):
         post_process_function=post_processing_function,
         compute_metrics=compute_metrics,
     )
+    ####################################################################################
+    # End SparseML Integration
+    ####################################################################################
 
     # Training
     if training_args.do_train:
@@ -612,6 +720,25 @@ def compute_metrics(p: EvalPrediction):
 
         trainer.push_to_hub(**kwargs)
 
+    ####################################################################################
+    # Start SparseML Integration
+    ####################################################################################
+    if data_args.do_onnx_export:
+        logger.info("*** Export to ONNX ***")
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        exporter = ModuleExporter(
+            model, output_dir=data_args.onnx_export_path
+        )
+        sample_batch = convert_example_to_features(
+            datasets["validation"][0],
+            tokenizer,
+            data_args.max_seq_length,
+            data_args.doc_stride
+        )
+        exporter.export_onnx(sample_batch=sample_batch, convert_qat=True)
+    ####################################################################################
+    # End SparseML Integration
+    ####################################################################################
 
 def _mp_fn(index):
     # For xla_spawn (TPUs)
diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py
new file mode 100644
index 000000000000..48951b6f5f59
--- /dev/null
+++ b/examples/pytorch/question-answering/sparseml_utils.py
@@ -0,0 +1,195 @@
+import collections
+import math
+import torch
+import torch.nn.functional as F
+import numpy
+from trainer_qa import QuestionAnsweringTrainer
+
+from sparseml.pytorch.optim.manager import ScheduledModifierManager
+from sparseml.pytorch.optim.optimizer import ScheduledOptimizer
+
+
+class SparseMLQATrainer(QuestionAnsweringTrainer):
+    """
+    Question Answering trainer with customized optimizer using SparseML
+
+    :param nm_prune_config: recipe for model sparsification
+    :param args, kwargs: arguments passed into parent class
+    """
+    def __init__(self, nm_prune_config, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.nm_prune_config = nm_prune_config
+
+    def create_optimizer(self):
+        """
+        Create optimizer customized using SparseML
+        """
+        super().create_optimizer()
+        steps_per_epoch = math.ceil(len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu))
+        manager = ScheduledModifierManager.from_yaml(self.nm_prune_config)
+        self.args.num_train_epochs = float(manager.max_epochs)
+        self.optimizer = ScheduledOptimizer(self.optimizer, self.model, manager, steps_per_epoch=steps_per_epoch, loggers=None)
+
+
+class SparseMLDistillQATrainer(SparseMLQATrainer):
+    """
+    Question Answering trainer using distillation with customized optimizer using SparseML
+
+    :param nm_prune_config: recipe for model sparsification
+    :param teacher: teacher model
+    :param distill_hardness: weight of the teacher loss
+    :param temperature: temperature used for loss
+    :param args, kwargs: arguments passed into parent class
+    """
+    def __init__(self, nm_prune_config, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs):
+        super().__init__(nm_prune_config, *args, **kwargs)
+        self.teacher = teacher
+        self.distill_hardness = distill_hardness
+        self.temperature = temperature
+        self.criterion = torch.nn.CrossEntropyLoss()
+
+    def compute_loss(self, model, inputs, return_outputs=False):
+        """
+        Computing loss using teacher/student distillation
+        """
+        outputs = model(**inputs)
+        loss = outputs['loss']
+        if self.teacher is not None:
+            input_device = inputs["input_ids"].device
+            self.teacher = self.teacher.to(input_device)
+            start_logits_student = outputs["start_logits"]
+            end_logits_student = outputs["end_logits"]
+            start_logits_label = inputs["start_positions"]
+            end_logits_label = inputs["end_positions"]
+            with torch.no_grad():
+                teacher_output = self.teacher(
+                    input_ids=inputs["input_ids"],
+                    token_type_ids=inputs["token_type_ids"],
+                    attention_mask=inputs["attention_mask"],
+                )
+            start_logits_teacher = teacher_output["start_logits"]
+            end_logits_teacher = teacher_output["end_logits"]
+            loss_start = (
+                F.kl_div(
+                    input=F.log_softmax(start_logits_student / self.temperature, dim=-1),
+                    target=F.softmax(start_logits_teacher / self.temperature, dim=-1),
+                    reduction="batchmean",
+                )
+                * (self.temperature ** 2)
+            )
+            loss_end = (
+                F.kl_div(
+                    input=F.log_softmax(end_logits_student / self.temperature, dim=-1),
+                    target=F.softmax(end_logits_teacher / self.temperature, dim=-1),
+                    reduction="batchmean",
+                )
+                * (self.temperature ** 2)
+            )
+            teacher_loss = (loss_start + loss_end) / 2.0
+            loss_start = self.criterion(start_logits_student, start_logits_label)
+            loss_end = self.criterion(end_logits_student, end_logits_label)
+            label_loss = (loss_start + loss_end) / 2.0
+            loss = ((1-self.distill_hardness) * label_loss) + (self.distill_hardness * teacher_loss)
+        return (loss, outputs) if return_outputs else loss
+
+
+def convert_example_to_features(example, tokenizer, max_seq_length, doc_stride, max_query_length=30):
+    """
+    Convert example to features, used for onnx export
+    """
+    Feature = collections.namedtuple(
+        "Feature",
+        [
+            "unique_id",
+            "tokens",
+            "example_index",
+            "token_to_orig_map",
+            "token_is_max_context",
+        ],
+    )
+    extra = []
+    unique_id = 0
+    query_tokens = 
tokenizer.tokenize(example["question"])[0:max_query_length] + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example["context"]): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + is_max_context = _check_is_max_context( + doc_spans, doc_span_index, split_token_index + ) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + feature = Feature( + unique_id=unique_id, + tokens=tokens, + example_index=0, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + ) + extra.append(feature) + unique_id += 1 + # extra is used as additional data but sparseml doesn't support it + return ( + torch.from_numpy(numpy.array([numpy.array(input_ids, dtype=numpy.int64)])), + torch.from_numpy(numpy.array([numpy.array(input_mask, dtype=numpy.int64)])), + torch.from_numpy(numpy.array([numpy.array(segment_ids, dtype=numpy.int64)])), + ) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + return cur_span_index == best_span_index From ab5f8521b095ccca62b2fe4418ec9367fcc3a932 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Fri, 4 Jun 2021 07:25:45 -0400 Subject: [PATCH 2/7] Overwrite scaler's step if it exists (for amp mode) --- .../question-answering/sparseml_utils.py | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py index 48951b6f5f59..0702942c6c77 100644 --- a/examples/pytorch/question-answering/sparseml_utils.py +++ b/examples/pytorch/question-answering/sparseml_utils.py @@ -16,6 +16,7 @@ class 
SparseMLQATrainer(QuestionAnsweringTrainer): :param nm_prune_config: recipe for model sparsification :param args, kwargs: arguments passed into parent class """ + def __init__(self, nm_prune_config, *args, **kwargs): super().__init__(*args, **kwargs) self.nm_prune_config = nm_prune_config @@ -25,10 +26,20 @@ def create_optimizer(self): Create optimizer customized using SparseML """ super().create_optimizer() - steps_per_epoch = math.ceil(len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu)) + steps_per_epoch = math.ceil( + len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu) + ) manager = ScheduledModifierManager.from_yaml(self.nm_prune_config) self.args.num_train_epochs = float(manager.max_epochs) - self.optimizer = ScheduledOptimizer(self.optimizer, self.model, manager, steps_per_epoch=steps_per_epoch, loggers=None) + if hasattr(self, "scaler"): + manager.initialize(self.model, epoch=0.0) + self.scaler = manager.modify( + self.model, self.optimizer, steps_per_epoch=steps_per_epoch, wrap_optim=self.scaler + ) + else: + self.optimizer = ScheduledOptimizer( + self.optimizer, self.model, manager, steps_per_epoch=steps_per_epoch, loggers=None + ) class SparseMLDistillQATrainer(SparseMLQATrainer): @@ -41,6 +52,7 @@ class SparseMLDistillQATrainer(SparseMLQATrainer): :param temperature: temperature used for loss :param args, kwargs: arguments passed into parent class """ + def __init__(self, nm_prune_config, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs): super().__init__(nm_prune_config, *args, **kwargs) self.teacher = teacher @@ -53,7 +65,7 @@ def compute_loss(self, model, inputs, return_outputs=False): Computing loss using teacher/student distillation """ outputs = model(**inputs) - loss = outputs['loss'] + loss = outputs["loss"] if self.teacher is not None: input_device = inputs["input_ids"].device self.teacher = self.teacher.to(input_device) @@ -63,10 +75,10 @@ def compute_loss(self, model, inputs, return_outputs=False): end_logits_label = inputs["start_positions"] with torch.no_grad(): teacher_output = self.teacher( - input_ids=inputs["input_ids"], - token_type_ids=inputs["token_type_ids"], - attention_mask=inputs["attention_mask"], - ) + input_ids=inputs["input_ids"], + token_type_ids=inputs["token_type_ids"], + attention_mask=inputs["attention_mask"], + ) start_logits_teacher = teacher_output["start_logits"] end_logits_teacher = teacher_output["end_logits"] loss_start = ( @@ -89,8 +101,8 @@ def compute_loss(self, model, inputs, return_outputs=False): loss_start = self.criterion(start_logits_student, start_logits_label) loss_end = self.criterion(end_logits_student, end_logits_label) label_loss = (loss_start + loss_end) / 2.0 - loss = ((1-self.distill_hardness) * label_loss) + (self.distill_hardness * teacher_loss) - return (loss, outputs) if return_outputs else loss + loss = ((1 - self.distill_hardness) * label_loss) + (self.distill_hardness * teacher_loss) + return (loss, outputs) if return_outputs else loss def convert_example_to_features(example, tokenizer, max_seq_length, doc_stride, max_query_length=30): @@ -146,9 +158,7 @@ def convert_example_to_features(example, tokenizer, max_seq_length, doc_stride, for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - is_max_context = _check_is_max_context( - doc_spans, doc_span_index, split_token_index - ) + is_max_context = _check_is_max_context(doc_spans, 
doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(1) From f9c843b6f5ec58a279359ec023c70533f82224ed Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Mon, 7 Jun 2021 11:21:07 -0400 Subject: [PATCH 3/7] Add wandb logger --- .../question-answering/sparseml_utils.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py index 0702942c6c77..f0c1f124a4ea 100644 --- a/examples/pytorch/question-answering/sparseml_utils.py +++ b/examples/pytorch/question-answering/sparseml_utils.py @@ -8,6 +8,8 @@ from sparseml.pytorch.optim.manager import ScheduledModifierManager from sparseml.pytorch.optim.optimizer import ScheduledOptimizer +from sparseml.pytorch.utils import logger + class SparseMLQATrainer(QuestionAnsweringTrainer): """ @@ -20,6 +22,11 @@ class SparseMLQATrainer(QuestionAnsweringTrainer): def __init__(self, nm_prune_config, *args, **kwargs): super().__init__(*args, **kwargs) self.nm_prune_config = nm_prune_config + self.manager = None + loggers = [] + if "wandb" in self.args.report_to: + loggers.append(logger.WANDBLogger()) + self.loggers = loggers def create_optimizer(self): """ @@ -29,16 +36,16 @@ def create_optimizer(self): steps_per_epoch = math.ceil( len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu) ) - manager = ScheduledModifierManager.from_yaml(self.nm_prune_config) - self.args.num_train_epochs = float(manager.max_epochs) + self.manager = ScheduledModifierManager.from_yaml(self.nm_prune_config) + self.args.num_train_epochs = float(self.manager.max_epochs) if hasattr(self, "scaler"): - manager.initialize(self.model, epoch=0.0) - self.scaler = manager.modify( + self.manager.initialize(self.model, epoch=0.0, loggers=self.loggers) + self.scaler = self.manager.modify( self.model, self.optimizer, steps_per_epoch=steps_per_epoch, wrap_optim=self.scaler ) else: self.optimizer = ScheduledOptimizer( - self.optimizer, self.model, manager, steps_per_epoch=steps_per_epoch, loggers=None + self.optimizer, self.model, self.manager, steps_per_epoch=steps_per_epoch, loggers=self.loggers ) From 2d580c7060e145aa9a9e561bf21ef2e8b3e6196d Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 10 Jun 2021 07:25:36 -0400 Subject: [PATCH 4/7] Remove distill script (to be unified with run_qa), recipes (to be moved to sparseml) --- .../recipes/finetune_squad_2epochs.yaml | 6 - ..._80blocksparse_freq0.01_18prune10fine.yaml | 871 ------------------ .../question-answering/run_distill_qa.py | 791 ---------------- 3 files changed, 1668 deletions(-) delete mode 100644 examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml delete mode 100644 examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml delete mode 100755 examples/pytorch/question-answering/run_distill_qa.py diff --git a/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml b/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml deleted file mode 100644 index 8529afa2f4f6..000000000000 --- a/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml +++ /dev/null @@ -1,6 +0,0 @@ -version: 1.1.0 - -modifiers: - - !EpochRangeModifier - end_epoch: 2 - start_epoch: 0.0 \ No newline at end of file diff --git 
a/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml b/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml deleted file mode 100644 index 1673f5441e49..000000000000 --- a/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml +++ /dev/null @@ -1,871 +0,0 @@ -version: 1.1.0 - -modifiers: - - !EpochRangeModifier - end_epoch: 30 - start_epoch: 0.0 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: 
cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - 
log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - 
mask_type: [1,4] - params: ['bert.encoder.layer.6.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: 
['bert.encoder.layer.8.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: 
['bert.encoder.layer.10.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - diff --git a/examples/pytorch/question-answering/run_distill_qa.py b/examples/pytorch/question-answering/run_distill_qa.py deleted file mode 100755 index e1009ee67ec4..000000000000 --- a/examples/pytorch/question-answering/run_distill_qa.py +++ /dev/null @@ -1,791 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Example script for integrating spaseml with the transformers library to perform model distillation. -This script is addopted from hugging face's implementation for Question Answering on the SQUAD Dataset. 
-Hugging Face's original implementation is regularly updated and can be found at https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py -This script will: -- Load transformer based models -- Load a sparseml training and pruning optimizer -- Train on SQUAD -- Evaluate on SQUAD -- Export model to onnx. -########## -Command help: -usage: run_distill_qa.py [-h] \ - [--teacher_model_name_or_path] \ - [--student_model_name_or_path] \ - [--temperature] \ - [--distill_hardness] \ - [--dataset_name] \ - [--num_train_epochs] \ - [--do_train] \ - [--do_eval] \ - [--per_device_train_batch_size] \ - [--per_device_eval_batch_size] \ - [--learning_rate]\ - [--max_seq_length]\ - [--doc_stride]\ - [--output_dir] \ - [--overwrite_output_dir] \ - [--cache_dir]\ - [--preprocessing_num_workers] \ - [--seed] 42 \ - [--nm_prune_config] \ - [--do_onnx_export] \ - [--onnx_export_path] \ - [--layers_to_keep] \ - -Train, prune, and evaluate a transformer base question answering model on squad. - -h, --help show this help message and exit - --teacher_model_name_or_path The name or path of model which will be used for distilation. - Note, this model needs to be trained for QA task already. - --student_model_name_or_path The path to the transformers model you wish to train - or the name of the pretrained language model you wish - to use. ex: bert-base-uncased. - --temperature Hyperparameter which controls model distilation - --distill_hardness Hyperparameter which controls how much of the loss comes from teacher vs training labels - --model_name_or_path The path to the transformers model you wish to train - --temperature Hyperparameter which controls model distilation - --distill_hardness Hyperparameter which controls how much of the loss comes from teacher vs training labels - --dataset_name The name of which dataset you want to use to train or - your model. ex: squad for using SQuAD. - --num_train_epochs Paramater to control how many training epochs you wish - your model to train. - --do_train Boolean denoting if the model should be trained - or not. Default is false. - --do_eval Boolean denoting if the model should be evaluated - or not. Default is false. - --per_device_train_batch_size Size of each training batch based on samples per GPU. - 12 will fit in a 11gb GPU, 16 in a 16gb. - --per_device_eval_batch_size Size of each training batch based on samples per GPU. - 12 will fit in a 11gb GPU, 16 in a 16gb. - --learning_rate Learning rate initial float value. ex: 3e-5. - --max_seq_length Int for the max sequence length to be parsed as a context - window. ex: 384 tokens. - --output_dir Path which model checkpoints and paths should be saved. - --overwrite_output_dir Boolean to define if the - --cache_dir Directiory which cached transformer files(datasets, models - , tokenizers) are saved for fast loading. - --preprocessing_num_workers The amount of cpu workers which are used to process datasets - --seed Int which determines what random seed is for training/shuffling - --nm_prune_config Path to the neural magic prune configuration file. examples can - be found in prune_config_files but are customized for bert-base-uncased. - --do_onnx_export Boolean denoting if the model should be exported to onnx - --onnx_export_path Path where onnx model path will be exported. ex: onnx-export - --layers_to_keep Number of layers to keep from original model. 
Layers are dropped before training - -########## -Example command for training a 95% sparse BERT SQUAD model for 1 epoch with a unpruned teacher: -python run_distill_qa.py \ - --teacher_model_name_or_path models/neuralmagic-bert-squad-12layer-0sparse - --student_model_name_or_path bert-base-uncased \ - --dataset_name squad \ - --num_train_epochs 1 \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --per_device_eval_batch_size 12 \ - --learning_rate 3e-5 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir 95sparsity1epoch/ \ - --overwrite_output_dir \ - --cache_dir cache \ - --preprocessing_num_workers 8 \ - --seed 42 \ - --nm_prune_config prune_config_files/95sparsity1epoch.yaml \ - --do_onnx_export \ - --onnx_export_path 95sparsity1epoch/ \ - --distill_hardness 0.5 \ - --temperature 2.0 \ -""" -# You can also adapt this script on your own question answering task. Pointers for this are left as comments. - -import logging -import os -import numpy as np -import sys -from dataclasses import dataclass, field -from typing import Optional - -from datasets import load_dataset, load_metric - -import transformers -from transformers import ( - AutoConfig, - AutoModelForQuestionAnswering, - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - HfArgumentParser, - PreTrainedTokenizerFast, - TrainingArguments, - default_data_collator, - set_seed, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from utils_qa import postprocess_qa_predictions - -# Start SparseML integration -from sparseml_utils import SparseMLDistillQATrainer, convert_example_to_features -from sparseml.pytorch.utils import ModuleExporter -# End SparseML integration - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.7.0.dev0") - -logger = logging.getLogger(__name__) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - teacher_model_name_or_path: Optional[str] = field( - default="spacemanidol/neuralmagic-bert-squad-12layer-0sparse", metadata={"help": "Teacher model which needs to be a trained QA model"} - ) - student_model_name_or_path: Optional[str] = field( - default="bert-base-uncased", metadata={"help": "Student model"} - ) - temperature: Optional[float] = field( - default=2.0, metadata={"help": "Temperature applied to teacher softmax for distillation."} - ) - distill_hardness: Optional[float] = field( - default=1.0, metadata={"help": "Proportion of loss coming from teacher model."} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." 
- }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - #################################################################################### - # Start SparseML Integration - #################################################################################### - nm_prune_config: Optional[str] = field( - default="recipes/noprune1epoch.yaml", - metadata={"help": "The input file name for the Neural Magic pruning config"}, - ) - do_onnx_export: bool = field(default=True, metadata={"help": "Export model to onnx"}) - onnx_export_path: Optional[str] = field( - default="onnx-export", metadata={"help": "The filename and path which will be where onnx model is outputed"} - ) - #################################################################################### - # End SparseML Integration - #################################################################################### - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_seq_length: int = field( - default=384, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - pad_to_max_length: bool = field( - default=True, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " - "be faster on GPU but will be slower on TPU)." - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." - }, - ) - version_2_with_negative: bool = field( - default=False, metadata={"help": "If true, some of the examples do not have an answer."} - ) - null_score_diff_threshold: float = field( - default=0.0, - metadata={ - "help": "The threshold used to select the null answer: if the best answer has a score that is less than " - "the score of the null answer minus this threshold, the null answer is selected for this example. 
" - "Only useful when `version_2_with_negative=True`." - }, - ) - doc_stride: int = field( - default=128, - metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, - ) - n_best_size: int = field( - default=20, - metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, - ) - max_answer_length: int = field( - default=30, - metadata={ - "help": "The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another." - }, - ) - - def __post_init__(self): - if ( - self.dataset_name is None - and self.train_file is None - and self.validation_file is None - and self.test_file is None - ): - raise ValueError("Need either a dataset name or a training/validation file/test_file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.test_file is not None: - extension = self.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
- ) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - # Set the verbosity to info of the Transformers logger (on main process only): - if training_args.should_log: - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] - datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.student_model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.student_model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=True, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - student_model = AutoModelForQuestionAnswering.from_pretrained( - model_args.student_model_name_or_path, - from_tf=bool(".ckpt" in model_args.student_model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - teacher_model = None - if model_args.teacher_model_name_or_path != None: - teacher_model = AutoModelForQuestionAnswering.from_pretrained( - model_args.teacher_model_name_or_path, - from_tf=bool(".ckpt" in model_args.teacher_model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - ) - teacher_model_parameters = filter(lambda p: p.requires_grad, teacher_model.parameters()) - params = sum([np.prod(p.size()) for p in teacher_model_parameters]) - logger.info("Teacher Model has %s parameters", params) - - student_model_parameters = filter(lambda p: p.requires_grad, student_model.parameters()) - params = sum([np.prod(p.size()) for p in student_model_parameters]) - logger.info("Student Model has %s parameters", params) - - # Tokenizer check: this script requires a fast tokenizer. - if not isinstance(tokenizer, PreTrainedTokenizerFast): - raise ValueError( - "This example script only works for models that have a fast tokenizer. Checkout the big table of models " - "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " - "requirement" - ) - - # Preprocessing the datasets. - # Preprocessing is slighlty different for training and evaluation. - if training_args.do_train: - column_names = datasets["train"].column_names - elif training_args.do_eval: - column_names = datasets["validation"].column_names - else: - column_names = datasets["test"].column_names - question_column_name = "question" if "question" in column_names else column_names[0] - context_column_name = "context" if "context" in column_names else column_names[1] - answer_column_name = "answers" if "answers" in column_names else column_names[2] - - # Padding side determines if we do (question|context) or (context|question). - pad_on_right = tokenizer.padding_side == "right" - - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - # Training preprocessing - def prepare_train_features(examples): - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. 
- tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding example. This key gives us just that. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - # The offset mappings will give us a map from token to character position in the original context. This will - # help us compute the start_positions and end_positions. - offset_mapping = tokenized_examples.pop("offset_mapping") - - # Let's label those examples! - tokenized_examples["start_positions"] = [] - tokenized_examples["end_positions"] = [] - - for i, offsets in enumerate(offset_mapping): - # We will label impossible answers with the index of the CLS token. - input_ids = tokenized_examples["input_ids"][i] - cls_index = input_ids.index(tokenizer.cls_token_id) - - # Grab the sequence corresponding to that example (to know what is the context and what is the question). - sequence_ids = tokenized_examples.sequence_ids(i) - - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - answers = examples[answer_column_name][sample_index] - # If no answers are given, set the cls_index as answer. - if len(answers["answer_start"]) == 0: - tokenized_examples["start_positions"].append(cls_index) - tokenized_examples["end_positions"].append(cls_index) - else: - # Start/end character index of the answer in the text. - start_char = answers["answer_start"][0] - end_char = start_char + len(answers["text"][0]) - - # Start token index of the current span in the text. - token_start_index = 0 - while sequence_ids[token_start_index] != (1 if pad_on_right else 0): - token_start_index += 1 - - # End token index of the current span in the text. - token_end_index = len(input_ids) - 1 - while sequence_ids[token_end_index] != (1 if pad_on_right else 0): - token_end_index -= 1 - - # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). - if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): - tokenized_examples["start_positions"].append(cls_index) - tokenized_examples["end_positions"].append(cls_index) - else: - # Otherwise move the token_start_index and token_end_index to the two ends of the answer. - # Note: we could go after the last offset if the answer is the last word (edge case). 
- while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: - token_start_index += 1 - tokenized_examples["start_positions"].append(token_start_index - 1) - while offsets[token_end_index][1] >= end_char: - token_end_index -= 1 - tokenized_examples["end_positions"].append(token_end_index + 1) - - return tokenized_examples - - if training_args.do_train: - if "train" not in datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = datasets["train"] - if data_args.max_train_samples is not None: - # We will select sample from whole data if agument is specified - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - # Create train feature from dataset - train_dataset = train_dataset.map( - prepare_train_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - ) - if data_args.max_train_samples is not None: - # Number of samples might increase during Feature Creation, We select only specified max samples - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - - # Validation preprocessing - def prepare_validation_features(examples): - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. - tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding example. This key gives us just that. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - - # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the - # corresponding example_id and we will store the offset mappings. - tokenized_examples["example_id"] = [] - - for i in range(len(tokenized_examples["input_ids"])): - # Grab the sequence corresponding to that example (to know what is the context and what is the question). - sequence_ids = tokenized_examples.sequence_ids(i) - context_index = 1 if pad_on_right else 0 - - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - tokenized_examples["example_id"].append(examples["id"][sample_index]) - - # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token - # position is part of the context or not. 
- tokenized_examples["offset_mapping"][i] = [ - (o if sequence_ids[k] == context_index else None) - for k, o in enumerate(tokenized_examples["offset_mapping"][i]) - ] - - return tokenized_examples - - if training_args.do_eval: - if "validation" not in datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_examples = datasets["validation"] - if data_args.max_eval_samples is not None: - # We will select sample from whole data - eval_examples = eval_examples.select(range(data_args.max_eval_samples)) - # Validation Feature Creation - eval_dataset = eval_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - ) - if data_args.max_eval_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - - if training_args.do_predict: - if "test" not in datasets: - raise ValueError("--do_predict requires a test dataset") - predict_examples = datasets["test"] - if data_args.max_predict_samples is not None: - # We will select sample from whole data - predict_examples = predict_examples.select(range(data_args.max_predict_samples)) - # Predict Feature Creation - predict_dataset = predict_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - ) - if data_args.max_predict_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - - # Data collator - # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data - # collator. - data_collator = ( - default_data_collator - if data_args.pad_to_max_length - else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) - ) - - # Post-processing: - def post_processing_function(examples, features, predictions, stage="eval"): - # Post-processing: we match the start logits and end logits to answers in the original context. - predictions = postprocess_qa_predictions( - examples=examples, - features=features, - predictions=predictions, - version_2_with_negative=data_args.version_2_with_negative, - n_best_size=data_args.n_best_size, - max_answer_length=data_args.max_answer_length, - null_score_diff_threshold=data_args.null_score_diff_threshold, - output_dir=training_args.output_dir, - is_world_process_zero=trainer.is_world_process_zero(), - prefix=stage, - ) - # Format the result to the format the metric expects. 
- if data_args.version_2_with_negative: - formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() - ] - else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] - - references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] - return EvalPrediction(predictions=formatted_predictions, label_ids=references) - - metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") - - def compute_metrics(p: EvalPrediction): - return metric.compute(predictions=p.predictions, references=p.label_ids) - - #################################################################################### - # Start SparseML Integration - #################################################################################### - # Initialize our Trainer - trainer = SparseMLDistillQATrainer( - data_args.nm_prune_config, - teacher=teacher_model, - distill_hardness = model_args.distill_hardness, - temperature = model_args.temperature, - model=student_model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - eval_examples=eval_examples if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - post_process_function=post_processing_function, - compute_metrics=compute_metrics, - ) - #################################################################################### - # End SparseML Integration - #################################################################################### - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Prediction - if training_args.do_predict: - logger.info("*** Predict ***") - results = trainer.predict(predict_dataset, predict_examples) - metrics = results.metrics - - max_predict_samples = ( - data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) - ) - metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) - - trainer.log_metrics("predict", metrics) - trainer.save_metrics("predict", metrics) - - if training_args.push_to_hub: - kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "question-answering"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = 
f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - trainer.push_to_hub(**kwargs) - - #################################################################################### - # Start SparseML Integration - #################################################################################### - if data_args.do_onnx_export: - logger.info("*** Export to ONNX ***") - os.environ["TOKENIZERS_PARALLELISM"] = "false" - exporter = ModuleExporter( - student_model, output_dir=data_args.onnx_export_path - ) - sample_batch = convert_example_to_features( - datasets["validation"][0], - tokenizer, - data_args.max_seq_length, - data_args.doc_stride - ) - exporter.export_onnx(sample_batch=sample_batch, convert_qat=True) - #################################################################################### - # End SparseML Integration - #################################################################################### - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() From 9f58d85796a1d1d3c1e06418f0bb57e5c0229f65 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 10 Jun 2021 07:27:20 -0400 Subject: [PATCH 5/7] Include distillation into run_qa, code clean up --- examples/pytorch/question-answering/run_qa.py | 160 ++++-------------- 1 file changed, 35 insertions(+), 125 deletions(-) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 009613b38a46..5fac8304002a 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -14,89 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Example script for integrating spaseml with the transformers library. -This script is addopted from hugging face's implementation for Question Answering on the SQUAD Dataset. -Hugging Face's original implementation is regularly updated and can be found at https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py -This script will: -- Load transformer based modesl -- Load a sparseml training and pruning optimizer -- Train on SQUAD -- Evaluate on SQUAD -- Export model to onnx. -########## -Command help: -usage: run_qa.py [-h] \ - --model_name_or_path MODEL \ - [--dataset_name] \ - [--num_train_epochs] \ - [--do_train] \ - [--do_eval] \ - [--per_device_train_batch_size] \ - [--per_device_eval_batch_size] \ - [--learning_rate]\ - [--max_seq_length]\ - [--doc_stride]\ - [--output_dir] \ - [--overwrite_output_dir] \ - [--cache_dir]\ - [--preprocessing_num_workers] \ - [--seed] 42 \ - [--nm_prune_config] - [--do_onnx_export] - [--onnx_export_path] - -Train, prune, and evaluate a transformer base question answering model on squad. - -h, --help show this help message and exit - --model_name_or_path MODEL The path to the transformers model you wish to train - or the name of the pretrained language model you wish - to use. ex: bert-base-uncased. - --dataset_name The name of which dataset you want to use to train or - your model. ex: squad for using SQuAD. - --num_train_epochs Paramater to control how many training epochs you wish - your model to train. - --do_train Boolean denoting if the model should be trained - or not. Default is false. - --do_eval Boolean denoting if the model should be evaluated - or not. Default is false. - --per_device_train_batch_size Size of each training batch based on samples per GPU. 
- 12 will fit in a 11gb GPU, 16 in a 16gb. - --per_device_eval_batch_size Size of each training batch based on samples per GPU. - 12 will fit in a 11gb GPU, 16 in a 16gb. - --learning_rate Learning rate initial float value. ex: 3e-5. - --max_seq_length Int for the max sequence length to be parsed as a context - window. ex: 384 tokens. - --output_dir Path which model checkpoints and paths should be saved. - --overwrite_output_dir Boolean to define if the - --cache_dir Directiory which cached transformer files(datasets, models - , tokenizers) are saved for fast loading. - --preprocessing_num_workers The amount of cpu workers which are used to process datasets - --seed Int which determines what random seed is for training/shuffling - --nm_prune_config Path to the neural magic prune configuration file. examples can - be found in prune_config_files but are customized for bert-base-uncased. - --do_onnx_export Boolean denoting if the model should be exported to onnx - --onnx_export_path Path where onnx model path will be exported. ex: onnx-export - -########## -Example command for training a 95% sparse BERT SQUAD model for 1 epoch: -python run_qa.py \ - --model_name_or_path bert-base-uncased \ - --dataset_name squad \ - --num_train_epochs 1 \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --per_device_eval_batch_size 12 \ - --learning_rate 3e-5 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir 95sparsity1epoch/ \ - --overwrite_output_dir \ - --cache_dir cache \ - --preprocessing_num_workers 8 \ - --seed 42 \ - --nm_prune_config prune_config_files/95sparsity1epoch.yaml \ - --do_onnx_export \ - --onnx_export_path 95sparsity1epoch/ +Fine-tuning the library models for question answering. """ # You can also adapt this script on your own question answering task. Pointers for this are left as comments. @@ -106,9 +24,11 @@ from dataclasses import dataclass, field from typing import Optional +import numpy from datasets import load_dataset, load_metric import transformers +from sparseml_utils import SparseMLQATrainer, export_model from transformers import ( AutoConfig, AutoModelForQuestionAnswering, @@ -125,11 +45,6 @@ from transformers.utils import check_min_version from utils_qa import postprocess_qa_predictions -# Start SparseML integration -from sparseml_utils import SparseMLQATrainer, convert_example_to_features -from sparseml.pytorch.utils import ModuleExporter -# End SparseML integration - # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.7.0.dev0") @@ -142,10 +57,18 @@ class ModelArguments: """ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. """ - model_name_or_path: str = field( metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} ) + teacher_model_name_or_path: Optional[str] = field( + default=None, metadata={"help": "Teacher model which needs to be a trained QA model"} + ) + temperature: Optional[float] = field( + default=2.0, metadata={"help": "Temperature applied to teacher softmax for distillation."} + ) + distill_hardness: Optional[float] = field( + default=1.0, metadata={"help": "Proportion of loss coming from teacher model."} + ) config_name: Optional[str] = field( default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} ) @@ -175,21 +98,13 @@ class DataTrainingArguments: Arguments pertaining to what data we are going to input our model for training and eval. 
""" - #################################################################################### - # Start SparseML Integration - #################################################################################### - nm_prune_config: Optional[str] = field( - default="recipes/noprune1epoch.yaml", + recipe: Optional[str] = field( + default=None, metadata={"help": "The input file name for the Neural Magic pruning config"}, ) - do_onnx_export: bool = field(default=True, metadata={"help": "Export model to onnx"}) onnx_export_path: Optional[str] = field( - default="onnx-export", metadata={"help": "The filename and path which will be where onnx model is outputed"} + default=None, metadata={"help": "The filename and path which will be where onnx model is outputed"} ) - #################################################################################### - # End SparseML Integration - #################################################################################### - dataset_name: Optional[str] = field( default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} ) @@ -401,6 +316,18 @@ def main(): use_auth_token=True if model_args.use_auth_token else None, ) + teacher_model = None + if model_args.teacher_model_name_or_path is not None: + teacher_model = AutoModelForQuestionAnswering.from_pretrained( + model_args.teacher_model_name_or_path, + from_tf=bool(".ckpt" in model_args.teacher_model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + ) + teacher_model_parameters = filter(lambda p: p.requires_grad, teacher_model.parameters()) + params = sum([numpy.prod(p.size()) for p in teacher_model_parameters]) + logger.info("Teacher Model has %s parameters", params) + # Tokenizer check: this script requires a fast tokenizer. 
if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( @@ -643,12 +570,12 @@ def post_processing_function(examples, features, predictions, stage="eval"): def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) - #################################################################################### - # Start SparseML Integration - #################################################################################### # Initialize our Trainer trainer = SparseMLQATrainer( - data_args.nm_prune_config, + data_args.recipe, + teacher=teacher_model, + distill_hardness=model_args.distill_hardness, + temperature=model_args.temperature, model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, @@ -659,9 +586,6 @@ def compute_metrics(p: EvalPrediction): post_process_function=post_processing_function, compute_metrics=compute_metrics, ) - #################################################################################### - # End SparseML Integration - #################################################################################### # Training if training_args.do_train: @@ -720,25 +644,11 @@ def compute_metrics(p: EvalPrediction): trainer.push_to_hub(**kwargs) - #################################################################################### - # Start SparseML Integration - #################################################################################### - if data_args.do_onnx_export: + if data_args.onnx_export_path: logger.info("*** Export to ONNX ***") - os.environ["TOKENIZERS_PARALLELISM"] = "false" - exporter = ModuleExporter( - model, output_dir=data_args.onnx_export_path - ) - sample_batch = convert_example_to_features( - datasets["validation"][0], - tokenizer, - data_args.max_seq_length, - data_args.doc_stride - ) - exporter.export_onnx(sample_batch=sample_batch, convert_qat=True) - #################################################################################### - # End SparseML Integration - #################################################################################### + eval_dataloader = trainer.get_eval_dataloader(eval_dataset) + export_model(model, eval_dataloader, data_args.onnx_export_path) + def _mp_fn(index): # For xla_spawn (TPUs) From cea4db3d82d99f6e5fc3e617324b3712a83ef7e6 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 10 Jun 2021 07:28:23 -0400 Subject: [PATCH 6/7] Unify distill/non-distill trainer, simplify onnx export --- .../question-answering/sparseml_utils.py | 173 +++++------------- 1 file changed, 41 insertions(+), 132 deletions(-) diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py index f0c1f124a4ea..8e0aad3b52e4 100644 --- a/examples/pytorch/question-answering/sparseml_utils.py +++ b/examples/pytorch/question-answering/sparseml_utils.py @@ -1,42 +1,52 @@ -import collections import math + import torch import torch.nn.functional as F -import numpy -from trainer_qa import QuestionAnsweringTrainer from sparseml.pytorch.optim.manager import ScheduledModifierManager from sparseml.pytorch.optim.optimizer import ScheduledOptimizer - -from sparseml.pytorch.utils import logger +from sparseml.pytorch.utils import ModuleExporter, logger +from trainer_qa import QuestionAnsweringTrainer class SparseMLQATrainer(QuestionAnsweringTrainer): """ - Question Answering trainer with customized optimizer using SparseML + Question Answering trainer with SparseML integration - :param 
nm_prune_config: recipe for model sparsification + :param recipe: recipe for model sparsification + :param teacher: teacher model for distillation + :param distill_hardness: ratio of loss by teacher targets (between 0 and 1) + :param temperature: temperature for distillation :param args, kwargs: arguments passed into parent class """ - def __init__(self, nm_prune_config, *args, **kwargs): + def __init__(self, recipe, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs): super().__init__(*args, **kwargs) - self.nm_prune_config = nm_prune_config + self.recipe = recipe + self.teacher = teacher + self.distill_hardness = distill_hardness + self.temperature = temperature + self.criterion = torch.nn.CrossEntropyLoss() + self.manager = None - loggers = [] - if "wandb" in self.args.report_to: - loggers.append(logger.WANDBLogger()) - self.loggers = loggers + self.loggers = None + if self.recipe is not None: + loggers = [] + if "wandb" in self.args.report_to: + loggers.append(logger.WANDBLogger()) + self.loggers = loggers def create_optimizer(self): """ Create optimizer customized using SparseML """ super().create_optimizer() + if self.recipe is None: + return steps_per_epoch = math.ceil( len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu) ) - self.manager = ScheduledModifierManager.from_yaml(self.nm_prune_config) + self.manager = ScheduledModifierManager.from_yaml(self.recipe) self.args.num_train_epochs = float(self.manager.max_epochs) if hasattr(self, "scaler"): self.manager.initialize(self.model, epoch=0.0, loggers=self.loggers) @@ -48,38 +58,23 @@ def create_optimizer(self): self.optimizer, self.model, self.manager, steps_per_epoch=steps_per_epoch, loggers=self.loggers ) - -class SparseMLDistillQATrainer(SparseMLQATrainer): - """ - Question Answering trainer using distilation with customized optimizer using SparseML - - :param nm_prune_config: recipe for model sparsification - :param teacher: teacher model - :param distill_hardness: weight of the teacher loss - :param temperature: temperature used for loss - :param args, kwargs: arguments passed into parent class - """ - - def __init__(self, nm_prune_config, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs): - super().__init__(nm_prune_config, *args, **kwargs) - self.teacher = teacher - self.distill_hardness = distill_hardness - self.temperature = temperature - self.criterion = torch.nn.CrossEntropyLoss() - def compute_loss(self, model, inputs, return_outputs=False): """ Computing loss using teacher/student distillation """ + if self.recipe is None or self.teacher is None: + return super().compute_loss(model, inputs, return_outputs=return_outputs) + outputs = model(**inputs) - loss = outputs["loss"] - if self.teacher is not None: + if self.teacher is None: + loss = outputs["loss"] + else: input_device = inputs["input_ids"].device self.teacher = self.teacher.to(input_device) start_logits_student = outputs["start_logits"] end_logits_student = outputs["end_logits"] start_logits_label = inputs["start_positions"] - end_logits_label = inputs["start_positions"] + end_logits_label = inputs["end_positions"] with torch.no_grad(): teacher_output = self.teacher( input_ids=inputs["input_ids"], @@ -112,101 +107,15 @@ def compute_loss(self, model, inputs, return_outputs=False): return (loss, outputs) if return_outputs else loss -def convert_example_to_features(example, tokenizer, max_seq_length, doc_stride, max_query_length=30): +def export_model(model, dataloader, output_dir): """ - Convert 
example to features, used for onnx export + Export a trained model to ONNX + :param model: trained model + :param dataloader: dataloader to get sample batch + :param output_dir: output directory for ONNX model """ - Feature = collections.namedtuple( - "Feature", - [ - "unique_id", - "tokens", - "example_index", - "token_to_orig_map", - "token_is_max_context", - ], - ) - extra = [] - unique_id = 0 - query_tokens = tokenizer.tokenize(example["question"])[0:max_query_length] - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example["context"]): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - input_ids = tokenizer.convert_tokens_to_ids(tokens) - input_mask = [1] * len(input_ids) - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - feature = Feature( - unique_id=unique_id, - tokens=tokens, - example_index=0, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - ) - extra.append(feature) - unique_id += 1 - # extra is used as additional data but sparseml doesn't support it - return ( - torch.from_numpy(numpy.array([numpy.array(input_ids, dtype=numpy.int64)])), - torch.from_numpy(numpy.array([numpy.array(input_mask, dtype=numpy.int64)])), - torch.from_numpy(numpy.array([numpy.array(segment_ids, dtype=numpy.int64)])), - ) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - return cur_span_index == best_span_index + exporter = ModuleExporter(model, output_dir=output_dir) + for _, sample_batch in enumerate(dataloader): + sample_input = (sample_batch["input_ids"], sample_batch["attention_mask"], sample_batch["token_type_ids"]) + 
exporter.export_onnx(sample_batch=sample_input, convert_qat=True) + break From dcaa723f85a9f4de64cebffd836d5ff32bad11e3 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 10 Jun 2021 15:55:00 -0400 Subject: [PATCH 7/7] Simplify variable names for distillation --- examples/pytorch/question-answering/run_qa.py | 15 ++++++++------- .../question-answering/sparseml_utils.py | 18 +++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 5fac8304002a..0cdef4787b4c 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -60,10 +60,10 @@ class ModelArguments: model_name_or_path: str = field( metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} ) - teacher_model_name_or_path: Optional[str] = field( + distill_teacher: Optional[str] = field( default=None, metadata={"help": "Teacher model which needs to be a trained QA model"} ) - temperature: Optional[float] = field( + distill_temperature: Optional[float] = field( default=2.0, metadata={"help": "Temperature applied to teacher softmax for distillation."} ) distill_hardness: Optional[float] = field( @@ -100,7 +100,8 @@ class DataTrainingArguments: recipe: Optional[str] = field( default=None, - metadata={"help": "The input file name for the Neural Magic pruning config"}, + metadata={"help": "Path to a SparseML sparsification recipe, see https://github.com/neuralmagic/sparseml " + "for more information"}, ) onnx_export_path: Optional[str] = field( default=None, metadata={"help": "The filename and path which will be where onnx model is outputed"} @@ -317,10 +318,10 @@ def main(): ) teacher_model = None - if model_args.teacher_model_name_or_path is not None: + if model_args.distill_teacher is not None: teacher_model = AutoModelForQuestionAnswering.from_pretrained( - model_args.teacher_model_name_or_path, - from_tf=bool(".ckpt" in model_args.teacher_model_name_or_path), + model_args.distill_teacher, + from_tf=bool(".ckpt" in model_args.distill_teacher), config=config, cache_dir=model_args.cache_dir, ) @@ -575,7 +576,7 @@ def compute_metrics(p: EvalPrediction): data_args.recipe, teacher=teacher_model, distill_hardness=model_args.distill_hardness, - temperature=model_args.temperature, + distill_temperature=model_args.distill_temperature, model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py index 8e0aad3b52e4..ca30f7c61954 100644 --- a/examples/pytorch/question-answering/sparseml_utils.py +++ b/examples/pytorch/question-answering/sparseml_utils.py @@ -16,16 +16,16 @@ class SparseMLQATrainer(QuestionAnsweringTrainer): :param recipe: recipe for model sparsification :param teacher: teacher model for distillation :param distill_hardness: ratio of loss by teacher targets (between 0 and 1) - :param temperature: temperature for distillation + :param distill_temperature: temperature for distillation :param args, kwargs: arguments passed into parent class """ - def __init__(self, recipe, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs): + def __init__(self, recipe, teacher=None, distill_hardness=0.5, distill_temperature=2.0, *args, **kwargs): super().__init__(*args, **kwargs) self.recipe = recipe self.teacher = teacher self.distill_hardness = distill_hardness - 
self.temperature = temperature + self.distill_temperature = distill_temperature self.criterion = torch.nn.CrossEntropyLoss() self.manager = None @@ -85,19 +85,19 @@ def compute_loss(self, model, inputs, return_outputs=False): end_logits_teacher = teacher_output["end_logits"] loss_start = ( F.kl_div( - input=F.log_softmax(start_logits_student / self.temperature, dim=-1), - target=F.softmax(start_logits_teacher / self.temperature, dim=-1), + input=F.log_softmax(start_logits_student / self.distill_temperature, dim=-1), + target=F.softmax(start_logits_teacher / self.distill_temperature, dim=-1), reduction="batchmean", ) - * (self.temperature ** 2) + * (self.distill_temperature ** 2) ) loss_end = ( F.kl_div( - input=F.log_softmax(end_logits_student / self.temperature, dim=-1), - target=F.softmax(end_logits_teacher / self.temperature, dim=-1), + input=F.log_softmax(end_logits_student / self.distill_temperature, dim=-1), + target=F.softmax(end_logits_teacher / self.distill_temperature, dim=-1), reduction="batchmean", ) - * (self.temperature ** 2) + * (self.distill_temperature ** 2) ) teacher_loss = (loss_start + loss_end) / 2.0 loss_start = self.criterion(start_logits_student, start_logits_label)
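For reference, the distillation objective that compute_loss assembles can be restated compactly. A minimal sketch of the visible logic (function and argument names here are illustrative; the final weighting follows the distill_hardness help text, i.e. the proportion of the loss taken from the teacher targets):

import torch.nn.functional as F

def qa_distillation_loss(start_student, end_student, start_teacher, end_teacher,
                         start_positions, end_positions, hardness=0.5, temperature=2.0):
    # KL divergence between temperature-softened student and teacher distributions;
    # the temperature**2 factor keeps its gradient magnitude comparable to the hard-label term.
    def kl(student_logits, teacher_logits):
        return F.kl_div(
            input=F.log_softmax(student_logits / temperature, dim=-1),
            target=F.softmax(teacher_logits / temperature, dim=-1),
            reduction="batchmean",
        ) * (temperature ** 2)

    teacher_loss = (kl(start_student, start_teacher) + kl(end_student, end_teacher)) / 2.0
    label_loss = (F.cross_entropy(start_student, start_positions)
                  + F.cross_entropy(end_student, end_positions)) / 2.0
    return (1.0 - hardness) * label_loss + hardness * teacher_loss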