From 939e389b3c2f12e65c1e0467e21908080f8b9759 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 3 Jun 2021 01:56:04 -0400 Subject: [PATCH 1/7] Initial commit: QA and Distill QA with SparseML integ --- .../recipes/finetune_squad_2epochs.yaml | 6 + ..._80blocksparse_freq0.01_18prune10fine.yaml | 871 ++++++++++++++++++ .../question-answering/run_distill_qa.py | 791 ++++++++++++++++ examples/pytorch/question-answering/run_qa.py | 133 ++- .../question-answering/sparseml_utils.py | 195 ++++ 5 files changed, 1993 insertions(+), 3 deletions(-) create mode 100644 examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml create mode 100644 examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml create mode 100755 examples/pytorch/question-answering/run_distill_qa.py create mode 100644 examples/pytorch/question-answering/sparseml_utils.py diff --git a/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml b/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml new file mode 100644 index 000000000000..8529afa2f4f6 --- /dev/null +++ b/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml @@ -0,0 +1,6 @@ +version: 1.1.0 + +modifiers: + - !EpochRangeModifier + end_epoch: 2 + start_epoch: 0.0 \ No newline at end of file diff --git a/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml b/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml new file mode 100644 index 000000000000..1673f5441e49 --- /dev/null +++ b/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml @@ -0,0 +1,871 @@ +version: 1.1.0 + +modifiers: + - !EpochRangeModifier + end_epoch: 30 + start_epoch: 0.0 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.0.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + 
log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.1.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.2.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + 
mask_type: [1,4] + params: ['bert.encoder.layer.3.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.3.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.4.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: 
['bert.encoder.layer.5.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.5.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.6.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: 
['bert.encoder.layer.7.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.7.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.8.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: 
['bert.encoder.layer.9.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.9.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.10.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.attention.self.query.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.attention.self.key.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.attention.self.value.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.attention.output.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: ['bert.encoder.layer.11.intermediate.dense.weight'] + start_epoch: 2 + update_frequency: 0.01 + + - !GMPruningModifier + end_epoch: 20 + final_sparsity: 0.80 + init_sparsity: 0.00 + inter_func: cubic + leave_enabled: True + log_types: __ALL__ + mask_type: [1,4] + params: 
['bert.encoder.layer.11.output.dense.weight']
+    start_epoch: 2
+    update_frequency: 0.01
+
diff --git a/examples/pytorch/question-answering/run_distill_qa.py b/examples/pytorch/question-answering/run_distill_qa.py
new file mode 100755
index 000000000000..e1009ee67ec4
--- /dev/null
+++ b/examples/pytorch/question-answering/run_distill_qa.py
@@ -0,0 +1,791 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Example script for integrating SparseML with the transformers library to perform model distillation.
+This script is adapted from Hugging Face's implementation for Question Answering on the SQuAD dataset.
+Hugging Face's original implementation is regularly updated and can be found at https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py
+This script will:
+- Load transformer-based models
+- Load a SparseML training and pruning optimizer
+- Train on SQuAD
+- Evaluate on SQuAD
+- Export the model to ONNX.
+##########
+Command help:
+usage: run_distill_qa.py [-h] \
+    [--teacher_model_name_or_path] \
+    [--student_model_name_or_path] \
+    [--temperature] \
+    [--distill_hardness] \
+    [--dataset_name] \
+    [--num_train_epochs] \
+    [--do_train] \
+    [--do_eval] \
+    [--per_device_train_batch_size] \
+    [--per_device_eval_batch_size] \
+    [--learning_rate] \
+    [--max_seq_length] \
+    [--doc_stride] \
+    [--output_dir] \
+    [--overwrite_output_dir] \
+    [--cache_dir] \
+    [--preprocessing_num_workers] \
+    [--seed] \
+    [--nm_prune_config] \
+    [--do_onnx_export] \
+    [--onnx_export_path] \
+    [--layers_to_keep]
+
+Train, prune, and evaluate a transformer-based question answering model on SQuAD.
+  -h, --help  show this help message and exit
+  --teacher_model_name_or_path  The name or path of the model used for distillation.
+      Note: this model must already be trained for the QA task.
+  --student_model_name_or_path  The path to the transformers model you wish to train
+      or the name of the pretrained language model you wish to use. ex: bert-base-uncased.
+  --temperature  Hyperparameter (softmax temperature) which controls model distillation.
+  --distill_hardness  Hyperparameter which controls how much of the loss comes from the teacher vs the training labels.
+  --dataset_name  The name of the dataset to train your model on. ex: squad for SQuAD.
+  --num_train_epochs  Parameter to control how many training epochs the model should run.
+  --do_train  Boolean denoting if the model should be trained or not. Default is false.
+  --do_eval  Boolean denoting if the model should be evaluated or not. Default is false.
+  --per_device_train_batch_size  Size of each training batch based on samples per GPU.
+      12 will fit in an 11GB GPU, 16 in a 16GB GPU.
+  --per_device_eval_batch_size  Size of each evaluation batch based on samples per GPU.
+      12 will fit in an 11GB GPU, 16 in a 16GB GPU.
+  --learning_rate  Initial learning rate as a float value. ex: 3e-5.
+  --max_seq_length  Int for the max sequence length to be parsed as a context window. ex: 384 tokens.
+  --doc_stride  Stride, in tokens, taken between chunks when splitting up a long document.
+  --output_dir  Path where model checkpoints and outputs should be saved.
+  --overwrite_output_dir  Boolean denoting if an existing output directory should be overwritten.
+  --cache_dir  Directory where cached transformer files (datasets, models, tokenizers) are saved for fast loading.
+  --preprocessing_num_workers  The number of CPU workers used to preprocess the datasets.
+  --seed  Int which determines the random seed used for training/shuffling.
+  --nm_prune_config  Path to the Neural Magic pruning configuration file. Examples can
+      be found in prune_config_files but are customized for bert-base-uncased.
+  --do_onnx_export  Boolean denoting if the model should be exported to ONNX.
+  --onnx_export_path  Path where the ONNX model will be exported. ex: onnx-export.
+  --layers_to_keep  Number of layers to keep from the original model. Layers are dropped before training.
+
+##########
+Example command for training a 95% sparse BERT SQuAD model for 1 epoch with an unpruned teacher:
+python run_distill_qa.py \
+    --teacher_model_name_or_path models/neuralmagic-bert-squad-12layer-0sparse \
+    --student_model_name_or_path bert-base-uncased \
+    --dataset_name squad \
+    --num_train_epochs 1 \
+    --do_train \
+    --do_eval \
+    --per_device_train_batch_size 12 \
+    --per_device_eval_batch_size 12 \
+    --learning_rate 3e-5 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir 95sparsity1epoch/ \
+    --overwrite_output_dir \
+    --cache_dir cache \
+    --preprocessing_num_workers 8 \
+    --seed 42 \
+    --nm_prune_config prune_config_files/95sparsity1epoch.yaml \
+    --do_onnx_export \
+    --onnx_export_path 95sparsity1epoch/ \
+    --distill_hardness 0.5 \
+    --temperature 2.0
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import logging
+import os
+import numpy as np
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+from datasets import load_dataset, load_metric
+
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoModelForQuestionAnswering,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    EvalPrediction,
+    HfArgumentParser,
+    PreTrainedTokenizerFast,
+    TrainingArguments,
+    default_data_collator,
+    set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version
+from utils_qa import postprocess_qa_predictions
+
+# Start SparseML integration
+from sparseml_utils import SparseMLDistillQATrainer, convert_example_to_features
+from sparseml.pytorch.utils import ModuleExporter
+# End SparseML integration
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.7.0.dev0")
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+ """ + teacher_model_name_or_path: Optional[str] = field( + default="spacemanidol/neuralmagic-bert-squad-12layer-0sparse", metadata={"help": "Teacher model which needs to be a trained QA model"} + ) + student_model_name_or_path: Optional[str] = field( + default="bert-base-uncased", metadata={"help": "Student model"} + ) + temperature: Optional[float] = field( + default=2.0, metadata={"help": "Temperature applied to teacher softmax for distillation."} + ) + distill_hardness: Optional[float] = field( + default=1.0, metadata={"help": "Proportion of loss coming from teacher model."} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + #################################################################################### + # Start SparseML Integration + #################################################################################### + nm_prune_config: Optional[str] = field( + default="recipes/noprune1epoch.yaml", + metadata={"help": "The input file name for the Neural Magic pruning config"}, + ) + do_onnx_export: bool = field(default=True, metadata={"help": "Export model to onnx"}) + onnx_export_path: Optional[str] = field( + default="onnx-export", metadata={"help": "The filename and path which will be where onnx model is outputed"} + ) + #################################################################################### + # End SparseML Integration + #################################################################################### + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + test_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_seq_length: int = field( + default=384, + metadata={ + "help": "The maximum total input sequence length after tokenization. 
Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " + "be faster on GPU but will be slower on TPU)." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + }, + ) + version_2_with_negative: bool = field( + default=False, metadata={"help": "If true, some of the examples do not have an answer."} + ) + null_score_diff_threshold: float = field( + default=0.0, + metadata={ + "help": "The threshold used to select the null answer: if the best answer has a score that is less than " + "the score of the null answer minus this threshold, the null answer is selected for this example. " + "Only useful when `version_2_with_negative=True`." + }, + ) + doc_stride: int = field( + default=128, + metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, + ) + n_best_size: int = field( + default=20, + metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, + ) + max_answer_length: int = field( + default=30, + metadata={ + "help": "The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another." + }, + ) + + def __post_init__(self): + if ( + self.dataset_name is None + and self.train_file is None + and self.validation_file is None + and self.test_file is None + ): + raise ValueError("Need either a dataset name or a training/validation file/test_file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." + if self.test_file is not None: + extension = self.test_file.split(".")[-1] + assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Detecting last checkpoint. 
+ last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if training_args.should_log: + transformers.utils.logging.set_verbosity_info() + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + logger.info(f"Training/evaluation parameters {training_args}") + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + extension = data_args.train_file.split(".")[-1] + + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = data_args.validation_file.split(".")[-1] + if data_args.test_file is not None: + data_files["test"] = data_args.test_file + extension = data_args.test_file.split(".")[-1] + datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.student_model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.student_model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=True, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + student_model = AutoModelForQuestionAnswering.from_pretrained( + model_args.student_model_name_or_path, + from_tf=bool(".ckpt" in model_args.student_model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + teacher_model = None + if model_args.teacher_model_name_or_path != None: + teacher_model = AutoModelForQuestionAnswering.from_pretrained( + model_args.teacher_model_name_or_path, + from_tf=bool(".ckpt" in model_args.teacher_model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + ) + teacher_model_parameters = filter(lambda p: p.requires_grad, teacher_model.parameters()) + params = sum([np.prod(p.size()) for p in teacher_model_parameters]) + logger.info("Teacher Model has %s parameters", params) + + student_model_parameters = filter(lambda p: p.requires_grad, student_model.parameters()) + params = sum([np.prod(p.size()) for p in student_model_parameters]) + logger.info("Student Model has %s parameters", params) + + # Tokenizer check: this script requires a fast tokenizer. + if not isinstance(tokenizer, PreTrainedTokenizerFast): + raise ValueError( + "This example script only works for models that have a fast tokenizer. Checkout the big table of models " + "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " + "requirement" + ) + + # Preprocessing the datasets. + # Preprocessing is slighlty different for training and evaluation. + if training_args.do_train: + column_names = datasets["train"].column_names + elif training_args.do_eval: + column_names = datasets["validation"].column_names + else: + column_names = datasets["test"].column_names + question_column_name = "question" if "question" in column_names else column_names[0] + context_column_name = "context" if "context" in column_names else column_names[1] + answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding side determines if we do (question|context) or (context|question). + pad_on_right = tokenizer.padding_side == "right" + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + # Training preprocessing + def prepare_train_features(examples): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
+ tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). 
+ while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + if training_args.do_train: + if "train" not in datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = datasets["train"] + if data_args.max_train_samples is not None: + # We will select sample from whole data if agument is specified + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + # Create train feature from dataset + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + if data_args.max_train_samples is not None: + # Number of samples might increase during Feature Creation, We select only specified max samples + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + # Validation preprocessing + def prepare_validation_features(examples): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=data_args.doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if data_args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + + if training_args.do_eval: + if "validation" not in datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = datasets["validation"] + if data_args.max_eval_samples is not None: + # We will select sample from whole data + eval_examples = eval_examples.select(range(data_args.max_eval_samples)) + # Validation Feature Creation + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + if data_args.max_eval_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + if training_args.do_predict: + if "test" not in datasets: + raise ValueError("--do_predict requires a test dataset") + predict_examples = datasets["test"] + if data_args.max_predict_samples is not None: + # We will select sample from whole data + predict_examples = predict_examples.select(range(data_args.max_predict_samples)) + # Predict Feature Creation + predict_dataset = predict_examples.map( + prepare_validation_features, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + if data_args.max_predict_samples is not None: + # During Feature creation dataset samples might increase, we will select required samples again + predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) + + # Data collator + # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data + # collator. + data_collator = ( + default_data_collator + if data_args.pad_to_max_length + else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) + ) + + # Post-processing: + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. + predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=data_args.version_2_with_negative, + n_best_size=data_args.n_best_size, + max_answer_length=data_args.max_answer_length, + null_score_diff_threshold=data_args.null_score_diff_threshold, + output_dir=training_args.output_dir, + is_world_process_zero=trainer.is_world_process_zero(), + prefix=stage, + ) + # Format the result to the format the metric expects. 
+ if data_args.version_2_with_negative: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") + + def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + #################################################################################### + # Start SparseML Integration + #################################################################################### + # Initialize our Trainer + trainer = SparseMLDistillQATrainer( + data_args.nm_prune_config, + teacher=teacher_model, + distill_hardness = model_args.distill_hardness, + temperature = model_args.temperature, + model=student_model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + eval_examples=eval_examples if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + post_process_function=post_processing_function, + compute_metrics=compute_metrics, + ) + #################################################################################### + # End SparseML Integration + #################################################################################### + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Prediction + if training_args.do_predict: + logger.info("*** Predict ***") + results = trainer.predict(predict_dataset, predict_examples) + metrics = results.metrics + + max_predict_samples = ( + data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) + ) + metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) + + trainer.log_metrics("predict", metrics) + trainer.save_metrics("predict", metrics) + + if training_args.push_to_hub: + kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "question-answering"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = 
f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + trainer.push_to_hub(**kwargs) + + #################################################################################### + # Start SparseML Integration + #################################################################################### + if data_args.do_onnx_export: + logger.info("*** Export to ONNX ***") + os.environ["TOKENIZERS_PARALLELISM"] = "false" + exporter = ModuleExporter( + student_model, output_dir=data_args.onnx_export_path + ) + sample_batch = convert_example_to_features( + datasets["validation"][0], + tokenizer, + data_args.max_seq_length, + data_args.doc_stride + ) + exporter.export_onnx(sample_batch=sample_batch, convert_qat=True) + #################################################################################### + # End SparseML Integration + #################################################################################### + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 27155208be5f..009613b38a46 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -14,7 +14,89 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Fine-tuning the library models for question answering. +Example script for integrating spaseml with the transformers library. +This script is addopted from hugging face's implementation for Question Answering on the SQUAD Dataset. +Hugging Face's original implementation is regularly updated and can be found at https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py +This script will: +- Load transformer based modesl +- Load a sparseml training and pruning optimizer +- Train on SQUAD +- Evaluate on SQUAD +- Export model to onnx. +########## +Command help: +usage: run_qa.py [-h] \ + --model_name_or_path MODEL \ + [--dataset_name] \ + [--num_train_epochs] \ + [--do_train] \ + [--do_eval] \ + [--per_device_train_batch_size] \ + [--per_device_eval_batch_size] \ + [--learning_rate]\ + [--max_seq_length]\ + [--doc_stride]\ + [--output_dir] \ + [--overwrite_output_dir] \ + [--cache_dir]\ + [--preprocessing_num_workers] \ + [--seed] 42 \ + [--nm_prune_config] + [--do_onnx_export] + [--onnx_export_path] + +Train, prune, and evaluate a transformer base question answering model on squad. + -h, --help show this help message and exit + --model_name_or_path MODEL The path to the transformers model you wish to train + or the name of the pretrained language model you wish + to use. ex: bert-base-uncased. + --dataset_name The name of which dataset you want to use to train or + your model. ex: squad for using SQuAD. + --num_train_epochs Paramater to control how many training epochs you wish + your model to train. + --do_train Boolean denoting if the model should be trained + or not. Default is false. + --do_eval Boolean denoting if the model should be evaluated + or not. Default is false. + --per_device_train_batch_size Size of each training batch based on samples per GPU. + 12 will fit in a 11gb GPU, 16 in a 16gb. + --per_device_eval_batch_size Size of each training batch based on samples per GPU. + 12 will fit in a 11gb GPU, 16 in a 16gb. + --learning_rate Learning rate initial float value. ex: 3e-5. 
@@ -27,7 +109,6 @@
 from datasets import load_dataset, load_metric
 
 import transformers
-from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     AutoConfig,
     AutoModelForQuestionAnswering,
@@ -44,6 +125,11 @@
 from transformers.utils import check_min_version
 from utils_qa import postprocess_qa_predictions
 
+# Start SparseML integration
+from sparseml_utils import SparseMLQATrainer, convert_example_to_features
+from sparseml.pytorch.utils import ModuleExporter
+# End SparseML integration
+
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.7.0.dev0")
@@ -89,6 +175,21 @@ class DataTrainingArguments:
     """
     Arguments pertaining to what data we are going to input our model for training and eval.
     """
+    ####################################################################################
+    # Start SparseML Integration
+    ####################################################################################
+    nm_prune_config: Optional[str] = field(
+        default="recipes/noprune1epoch.yaml",
+        metadata={"help": "The input file name for the Neural Magic pruning config"},
+    )
+    do_onnx_export: bool = field(default=True, metadata={"help": "Export model to onnx"})
+    onnx_export_path: Optional[str] = field(
+        default="onnx-export", metadata={"help": "The filename and path where the ONNX model will be output"}
+    )
+    ####################################################################################
+    # End SparseML Integration
+    ####################################################################################
+
     dataset_name: Optional[str] = field(
         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
     )
@@ -542,8 +643,12 @@ def post_processing_function(examples, features, predictions, stage="eval"):
     def compute_metrics(p: EvalPrediction):
         return metric.compute(predictions=p.predictions, references=p.label_ids)
 
+    ####################################################################################
+    # Start SparseML Integration
+    ####################################################################################
     # Initialize our Trainer
-    trainer = QuestionAnsweringTrainer(
+    trainer = SparseMLQATrainer(
+        data_args.nm_prune_config,
         model=model,
         args=training_args,
         train_dataset=train_dataset if training_args.do_train else None,
@@ -554,6 +659,9 @@ def compute_metrics(p: EvalPrediction):
         post_process_function=post_processing_function,
         compute_metrics=compute_metrics,
     )
+    ####################################################################################
+    # End SparseML Integration
+    ####################################################################################
 
     # Training
     if training_args.do_train:
@@ -612,6 +720,25 @@ def compute_metrics(p: EvalPrediction):
 
         trainer.push_to_hub(**kwargs)
 
+    ####################################################################################
+    # Start SparseML Integration
+    ####################################################################################
+    if data_args.do_onnx_export:
+        logger.info("*** Export to ONNX ***")
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        exporter = ModuleExporter(
+            model, output_dir=data_args.onnx_export_path
+        )
+        sample_batch = convert_example_to_features(
+            datasets["validation"][0],
+            tokenizer,
+            data_args.max_seq_length,
+            data_args.doc_stride
+        )
+        exporter.export_onnx(sample_batch=sample_batch, convert_qat=True)
+    ####################################################################################
+    # End SparseML Integration
+    ####################################################################################
 
 def _mp_fn(index):
     # For xla_spawn (TPUs)
diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py
new file mode 100644
index 000000000000..48951b6f5f59
--- /dev/null
+++ b/examples/pytorch/question-answering/sparseml_utils.py
@@ -0,0 +1,195 @@
+import collections
+import math
+import torch
+import torch.nn.functional as F
+import numpy
+from trainer_qa import QuestionAnsweringTrainer
+
+from sparseml.pytorch.optim.manager import ScheduledModifierManager
+from sparseml.pytorch.optim.optimizer import ScheduledOptimizer
+
+
+class SparseMLQATrainer(QuestionAnsweringTrainer):
+    """
+    Question Answering trainer with customized optimizer using SparseML
+
+    :param nm_prune_config: recipe for model sparsification
+    :param args, kwargs: arguments passed into parent class
+    """
+    def __init__(self, nm_prune_config, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.nm_prune_config = nm_prune_config
+
+    def create_optimizer(self):
+        """
+        Create optimizer customized using SparseML
+        """
+        super().create_optimizer()
+        steps_per_epoch = math.ceil(len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu))
+        manager = ScheduledModifierManager.from_yaml(self.nm_prune_config)
+        self.args.num_train_epochs = float(manager.max_epochs)
+        self.optimizer = ScheduledOptimizer(self.optimizer, self.model, manager, steps_per_epoch=steps_per_epoch, loggers=None)
+
+
+class SparseMLDistillQATrainer(SparseMLQATrainer):
+    """
+    Question Answering trainer using distillation with customized optimizer using SparseML
+
+    :param nm_prune_config: recipe for model sparsification
+    :param teacher: teacher model
+    :param distill_hardness: weight of the teacher loss
+    :param temperature: temperature used for loss
+    :param args, kwargs: arguments passed into parent class
+    """
+    def __init__(self, nm_prune_config, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs):
+        super().__init__(nm_prune_config, *args, **kwargs)
+        self.teacher = teacher
+        self.distill_hardness = distill_hardness
+        self.temperature = temperature
+        self.criterion = torch.nn.CrossEntropyLoss()
+
+    def compute_loss(self, model, inputs, return_outputs=False):
+        """
+        Computing loss using teacher/student distillation
+        """
+        outputs = model(**inputs)
+        loss = outputs['loss']
+        if self.teacher is not None:
+            input_device = inputs["input_ids"].device
+            self.teacher = self.teacher.to(input_device)
+            start_logits_student = outputs["start_logits"]
+            end_logits_student = outputs["end_logits"]
+            start_logits_label = inputs["start_positions"]
+            end_logits_label = inputs["end_positions"]
+            with torch.no_grad():
+                teacher_output = self.teacher(
+                    input_ids=inputs["input_ids"],
+                    token_type_ids=inputs["token_type_ids"],
+                    attention_mask=inputs["attention_mask"],
+                )
+            start_logits_teacher = teacher_output["start_logits"]
+            end_logits_teacher = teacher_output["end_logits"]
+            loss_start = (
+                F.kl_div(
+                    input=F.log_softmax(start_logits_student / self.temperature, dim=-1),
+                    target=F.softmax(start_logits_teacher / self.temperature, dim=-1),
+                    reduction="batchmean",
+                )
+                * (self.temperature ** 2)
+            )
+            loss_end = (
+                F.kl_div(
+                    input=F.log_softmax(end_logits_student / self.temperature, dim=-1),
+                    target=F.softmax(end_logits_teacher / self.temperature, dim=-1),
+                    reduction="batchmean",
+                )
+                * (self.temperature ** 2)
+            )
+            teacher_loss = (loss_start + loss_end) / 2.0
+            loss_start = self.criterion(start_logits_student, start_logits_label)
+            loss_end = self.criterion(end_logits_student, end_logits_label)
+            label_loss = (loss_start + loss_end) / 2.0
+            loss = ((1-self.distill_hardness) * label_loss) + (self.distill_hardness * teacher_loss)
+        return (loss, outputs) if return_outputs else loss
+
+
+def convert_example_to_features(example, tokenizer, max_seq_length, doc_stride, max_query_length=30):
+    """
+    Convert example to features, used for onnx export
+    """
+    Feature = collections.namedtuple(
+        "Feature",
+        [
+            "unique_id",
+            "tokens",
+            "example_index",
+            "token_to_orig_map",
+            "token_is_max_context",
+        ],
+    )
+    extra = []
+    unique_id = 0
+    query_tokens = 
tokenizer.tokenize(example["question"])[0:max_query_length] + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example["context"]): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + is_max_context = _check_is_max_context( + doc_spans, doc_span_index, split_token_index + ) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + feature = Feature( + unique_id=unique_id, + tokens=tokens, + example_index=0, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + ) + extra.append(feature) + unique_id += 1 + # extra is used as additional data but sparseml doesn't support it + return ( + torch.from_numpy(numpy.array([numpy.array(input_ids, dtype=numpy.int64)])), + torch.from_numpy(numpy.array([numpy.array(input_mask, dtype=numpy.int64)])), + torch.from_numpy(numpy.array([numpy.array(segment_ids, dtype=numpy.int64)])), + ) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + return cur_span_index == best_span_index From ab5f8521b095ccca62b2fe4418ec9367fcc3a932 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Fri, 4 Jun 2021 07:25:45 -0400 Subject: [PATCH 2/7] Overwrite scaler's step if it exists (for amp mode) --- .../question-answering/sparseml_utils.py | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py index 48951b6f5f59..0702942c6c77 100644 --- a/examples/pytorch/question-answering/sparseml_utils.py +++ b/examples/pytorch/question-answering/sparseml_utils.py @@ -16,6 +16,7 @@ class 
SparseMLQATrainer(QuestionAnsweringTrainer): :param nm_prune_config: recipe for model sparsification :param args, kwargs: arguments passed into parent class """ + def __init__(self, nm_prune_config, *args, **kwargs): super().__init__(*args, **kwargs) self.nm_prune_config = nm_prune_config @@ -25,10 +26,20 @@ def create_optimizer(self): Create optimizer customized using SparseML """ super().create_optimizer() - steps_per_epoch = math.ceil(len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu)) + steps_per_epoch = math.ceil( + len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu) + ) manager = ScheduledModifierManager.from_yaml(self.nm_prune_config) self.args.num_train_epochs = float(manager.max_epochs) - self.optimizer = ScheduledOptimizer(self.optimizer, self.model, manager, steps_per_epoch=steps_per_epoch, loggers=None) + if hasattr(self, "scaler"): + manager.initialize(self.model, epoch=0.0) + self.scaler = manager.modify( + self.model, self.optimizer, steps_per_epoch=steps_per_epoch, wrap_optim=self.scaler + ) + else: + self.optimizer = ScheduledOptimizer( + self.optimizer, self.model, manager, steps_per_epoch=steps_per_epoch, loggers=None + ) class SparseMLDistillQATrainer(SparseMLQATrainer): @@ -41,6 +52,7 @@ class SparseMLDistillQATrainer(SparseMLQATrainer): :param temperature: temperature used for loss :param args, kwargs: arguments passed into parent class """ + def __init__(self, nm_prune_config, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs): super().__init__(nm_prune_config, *args, **kwargs) self.teacher = teacher @@ -53,7 +65,7 @@ def compute_loss(self, model, inputs, return_outputs=False): Computing loss using teacher/student distillation """ outputs = model(**inputs) - loss = outputs['loss'] + loss = outputs["loss"] if self.teacher is not None: input_device = inputs["input_ids"].device self.teacher = self.teacher.to(input_device) @@ -63,10 +75,10 @@ def compute_loss(self, model, inputs, return_outputs=False): end_logits_label = inputs["start_positions"] with torch.no_grad(): teacher_output = self.teacher( - input_ids=inputs["input_ids"], - token_type_ids=inputs["token_type_ids"], - attention_mask=inputs["attention_mask"], - ) + input_ids=inputs["input_ids"], + token_type_ids=inputs["token_type_ids"], + attention_mask=inputs["attention_mask"], + ) start_logits_teacher = teacher_output["start_logits"] end_logits_teacher = teacher_output["end_logits"] loss_start = ( @@ -89,8 +101,8 @@ def compute_loss(self, model, inputs, return_outputs=False): loss_start = self.criterion(start_logits_student, start_logits_label) loss_end = self.criterion(end_logits_student, end_logits_label) label_loss = (loss_start + loss_end) / 2.0 - loss = ((1-self.distill_hardness) * label_loss) + (self.distill_hardness * teacher_loss) - return (loss, outputs) if return_outputs else loss + loss = ((1 - self.distill_hardness) * label_loss) + (self.distill_hardness * teacher_loss) + return (loss, outputs) if return_outputs else loss def convert_example_to_features(example, tokenizer, max_seq_length, doc_stride, max_query_length=30): @@ -146,9 +158,7 @@ def convert_example_to_features(example, tokenizer, max_seq_length, doc_stride, for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - is_max_context = _check_is_max_context( - doc_spans, doc_span_index, split_token_index - ) + is_max_context = _check_is_max_context(doc_spans, 
doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(1) From f9c843b6f5ec58a279359ec023c70533f82224ed Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Mon, 7 Jun 2021 11:21:07 -0400 Subject: [PATCH 3/7] Add wandb logger --- .../question-answering/sparseml_utils.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py index 0702942c6c77..f0c1f124a4ea 100644 --- a/examples/pytorch/question-answering/sparseml_utils.py +++ b/examples/pytorch/question-answering/sparseml_utils.py @@ -8,6 +8,8 @@ from sparseml.pytorch.optim.manager import ScheduledModifierManager from sparseml.pytorch.optim.optimizer import ScheduledOptimizer +from sparseml.pytorch.utils import logger + class SparseMLQATrainer(QuestionAnsweringTrainer): """ @@ -20,6 +22,11 @@ class SparseMLQATrainer(QuestionAnsweringTrainer): def __init__(self, nm_prune_config, *args, **kwargs): super().__init__(*args, **kwargs) self.nm_prune_config = nm_prune_config + self.manager = None + loggers = [] + if "wandb" in self.args.report_to: + loggers.append(logger.WANDBLogger()) + self.loggers = loggers def create_optimizer(self): """ @@ -29,16 +36,16 @@ def create_optimizer(self): steps_per_epoch = math.ceil( len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu) ) - manager = ScheduledModifierManager.from_yaml(self.nm_prune_config) - self.args.num_train_epochs = float(manager.max_epochs) + self.manager = ScheduledModifierManager.from_yaml(self.nm_prune_config) + self.args.num_train_epochs = float(self.manager.max_epochs) if hasattr(self, "scaler"): - manager.initialize(self.model, epoch=0.0) - self.scaler = manager.modify( + self.manager.initialize(self.model, epoch=0.0, loggers=self.loggers) + self.scaler = self.manager.modify( self.model, self.optimizer, steps_per_epoch=steps_per_epoch, wrap_optim=self.scaler ) else: self.optimizer = ScheduledOptimizer( - self.optimizer, self.model, manager, steps_per_epoch=steps_per_epoch, loggers=None + self.optimizer, self.model, self.manager, steps_per_epoch=steps_per_epoch, loggers=self.loggers ) From 2d580c7060e145aa9a9e561bf21ef2e8b3e6196d Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 10 Jun 2021 07:25:36 -0400 Subject: [PATCH 4/7] Remove distill script (to be unified with run_qa), recipes (to be moved to sparseml) --- .../recipes/finetune_squad_2epochs.yaml | 6 - ..._80blocksparse_freq0.01_18prune10fine.yaml | 871 ------------------ .../question-answering/run_distill_qa.py | 791 ---------------- 3 files changed, 1668 deletions(-) delete mode 100644 examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml delete mode 100644 examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml delete mode 100755 examples/pytorch/question-answering/run_distill_qa.py diff --git a/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml b/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml deleted file mode 100644 index 8529afa2f4f6..000000000000 --- a/examples/pytorch/question-answering/recipes/finetune_squad_2epochs.yaml +++ /dev/null @@ -1,6 +0,0 @@ -version: 1.1.0 - -modifiers: - - !EpochRangeModifier - end_epoch: 2 - start_epoch: 0.0 \ No newline at end of file diff --git 
a/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml b/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml deleted file mode 100644 index 1673f5441e49..000000000000 --- a/examples/pytorch/question-answering/recipes/uni_80blocksparse_freq0.01_18prune10fine.yaml +++ /dev/null @@ -1,871 +0,0 @@ -version: 1.1.0 - -modifiers: - - !EpochRangeModifier - end_epoch: 30 - start_epoch: 0.0 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.0.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: 
cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.1.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.2.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.3.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - 
log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.4.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.5.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - 
mask_type: [1,4] - params: ['bert.encoder.layer.6.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.6.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.7.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: 
['bert.encoder.layer.8.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.8.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.9.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: 
['bert.encoder.layer.10.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.10.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.attention.self.query.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.attention.self.key.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.attention.self.value.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.attention.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.intermediate.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - - - !GMPruningModifier - end_epoch: 20 - final_sparsity: 0.80 - init_sparsity: 0.00 - inter_func: cubic - leave_enabled: True - log_types: __ALL__ - mask_type: [1,4] - params: ['bert.encoder.layer.11.output.dense.weight'] - start_epoch: 2 - update_frequency: 0.01 - diff --git a/examples/pytorch/question-answering/run_distill_qa.py b/examples/pytorch/question-answering/run_distill_qa.py deleted file mode 100755 index e1009ee67ec4..000000000000 --- a/examples/pytorch/question-answering/run_distill_qa.py +++ /dev/null @@ -1,791 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2020 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Example script for integrating spaseml with the transformers library to perform model distillation. -This script is addopted from hugging face's implementation for Question Answering on the SQUAD Dataset. 
-Hugging Face's original implementation is regularly updated and can be found at https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py -This script will: -- Load transformer based models -- Load a sparseml training and pruning optimizer -- Train on SQUAD -- Evaluate on SQUAD -- Export model to onnx. -########## -Command help: -usage: run_distill_qa.py [-h] \ - [--teacher_model_name_or_path] \ - [--student_model_name_or_path] \ - [--temperature] \ - [--distill_hardness] \ - [--dataset_name] \ - [--num_train_epochs] \ - [--do_train] \ - [--do_eval] \ - [--per_device_train_batch_size] \ - [--per_device_eval_batch_size] \ - [--learning_rate]\ - [--max_seq_length]\ - [--doc_stride]\ - [--output_dir] \ - [--overwrite_output_dir] \ - [--cache_dir]\ - [--preprocessing_num_workers] \ - [--seed] 42 \ - [--nm_prune_config] \ - [--do_onnx_export] \ - [--onnx_export_path] \ - [--layers_to_keep] \ - -Train, prune, and evaluate a transformer base question answering model on squad. - -h, --help show this help message and exit - --teacher_model_name_or_path The name or path of model which will be used for distilation. - Note, this model needs to be trained for QA task already. - --student_model_name_or_path The path to the transformers model you wish to train - or the name of the pretrained language model you wish - to use. ex: bert-base-uncased. - --temperature Hyperparameter which controls model distilation - --distill_hardness Hyperparameter which controls how much of the loss comes from teacher vs training labels - --model_name_or_path The path to the transformers model you wish to train - --temperature Hyperparameter which controls model distilation - --distill_hardness Hyperparameter which controls how much of the loss comes from teacher vs training labels - --dataset_name The name of which dataset you want to use to train or - your model. ex: squad for using SQuAD. - --num_train_epochs Paramater to control how many training epochs you wish - your model to train. - --do_train Boolean denoting if the model should be trained - or not. Default is false. - --do_eval Boolean denoting if the model should be evaluated - or not. Default is false. - --per_device_train_batch_size Size of each training batch based on samples per GPU. - 12 will fit in a 11gb GPU, 16 in a 16gb. - --per_device_eval_batch_size Size of each training batch based on samples per GPU. - 12 will fit in a 11gb GPU, 16 in a 16gb. - --learning_rate Learning rate initial float value. ex: 3e-5. - --max_seq_length Int for the max sequence length to be parsed as a context - window. ex: 384 tokens. - --output_dir Path which model checkpoints and paths should be saved. - --overwrite_output_dir Boolean to define if the - --cache_dir Directiory which cached transformer files(datasets, models - , tokenizers) are saved for fast loading. - --preprocessing_num_workers The amount of cpu workers which are used to process datasets - --seed Int which determines what random seed is for training/shuffling - --nm_prune_config Path to the neural magic prune configuration file. examples can - be found in prune_config_files but are customized for bert-base-uncased. - --do_onnx_export Boolean denoting if the model should be exported to onnx - --onnx_export_path Path where onnx model path will be exported. ex: onnx-export - --layers_to_keep Number of layers to keep from original model. 
Layers are dropped before training - -########## -Example command for training a 95% sparse BERT SQUAD model for 1 epoch with a unpruned teacher: -python run_distill_qa.py \ - --teacher_model_name_or_path models/neuralmagic-bert-squad-12layer-0sparse - --student_model_name_or_path bert-base-uncased \ - --dataset_name squad \ - --num_train_epochs 1 \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --per_device_eval_batch_size 12 \ - --learning_rate 3e-5 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir 95sparsity1epoch/ \ - --overwrite_output_dir \ - --cache_dir cache \ - --preprocessing_num_workers 8 \ - --seed 42 \ - --nm_prune_config prune_config_files/95sparsity1epoch.yaml \ - --do_onnx_export \ - --onnx_export_path 95sparsity1epoch/ \ - --distill_hardness 0.5 \ - --temperature 2.0 \ -""" -# You can also adapt this script on your own question answering task. Pointers for this are left as comments. - -import logging -import os -import numpy as np -import sys -from dataclasses import dataclass, field -from typing import Optional - -from datasets import load_dataset, load_metric - -import transformers -from transformers import ( - AutoConfig, - AutoModelForQuestionAnswering, - AutoTokenizer, - DataCollatorWithPadding, - EvalPrediction, - HfArgumentParser, - PreTrainedTokenizerFast, - TrainingArguments, - default_data_collator, - set_seed, -) -from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version -from utils_qa import postprocess_qa_predictions - -# Start SparseML integration -from sparseml_utils import SparseMLDistillQATrainer, convert_example_to_features -from sparseml.pytorch.utils import ModuleExporter -# End SparseML integration - - -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.7.0.dev0") - -logger = logging.getLogger(__name__) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. - """ - teacher_model_name_or_path: Optional[str] = field( - default="spacemanidol/neuralmagic-bert-squad-12layer-0sparse", metadata={"help": "Teacher model which needs to be a trained QA model"} - ) - student_model_name_or_path: Optional[str] = field( - default="bert-base-uncased", metadata={"help": "Student model"} - ) - temperature: Optional[float] = field( - default=2.0, metadata={"help": "Temperature applied to teacher softmax for distillation."} - ) - distill_hardness: Optional[float] = field( - default=1.0, metadata={"help": "Proportion of loss coming from teacher model."} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " - "with private models)." 
- }, - ) - - -@dataclass -class DataTrainingArguments: - """ - Arguments pertaining to what data we are going to input our model for training and eval. - """ - - #################################################################################### - # Start SparseML Integration - #################################################################################### - nm_prune_config: Optional[str] = field( - default="recipes/noprune1epoch.yaml", - metadata={"help": "The input file name for the Neural Magic pruning config"}, - ) - do_onnx_export: bool = field(default=True, metadata={"help": "Export model to onnx"}) - onnx_export_path: Optional[str] = field( - default="onnx-export", metadata={"help": "The filename and path which will be where onnx model is outputed"} - ) - #################################################################################### - # End SparseML Integration - #################################################################################### - - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."}, - ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - preprocessing_num_workers: Optional[int] = field( - default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - max_seq_length: int = field( - default=384, - metadata={ - "help": "The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - }, - ) - pad_to_max_length: bool = field( - default=True, - metadata={ - "help": "Whether to pad all samples to `max_seq_length`. " - "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can " - "be faster on GPU but will be slower on TPU)." - }, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - }, - ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " - "value if set." - }, - ) - version_2_with_negative: bool = field( - default=False, metadata={"help": "If true, some of the examples do not have an answer."} - ) - null_score_diff_threshold: float = field( - default=0.0, - metadata={ - "help": "The threshold used to select the null answer: if the best answer has a score that is less than " - "the score of the null answer minus this threshold, the null answer is selected for this example. 
" - "Only useful when `version_2_with_negative=True`." - }, - ) - doc_stride: int = field( - default=128, - metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."}, - ) - n_best_size: int = field( - default=20, - metadata={"help": "The total number of n-best predictions to generate when looking for an answer."}, - ) - max_answer_length: int = field( - default=30, - metadata={ - "help": "The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another." - }, - ) - - def __post_init__(self): - if ( - self.dataset_name is None - and self.train_file is None - and self.validation_file is None - and self.test_file is None - ): - raise ValueError("Need either a dataset name or a training/validation file/test_file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json"], "`train_file` should be a csv or a json file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.test_file is not None: - extension = self.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - # Detecting last checkpoint. - last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
- ) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if training_args.should_log else logging.WARN) - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - # Set the verbosity to info of the Transformers logger (on main process only): - if training_args.should_log: - transformers.utils.logging.set_verbosity_info() - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - logger.info(f"Training/evaluation parameters {training_args}") - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if data_args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) - else: - data_files = {} - if data_args.train_file is not None: - data_files["train"] = data_args.train_file - extension = data_args.train_file.split(".")[-1] - - if data_args.validation_file is not None: - data_files["validation"] = data_args.validation_file - extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] - datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. 
- config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.student_model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.student_model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=True, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - student_model = AutoModelForQuestionAnswering.from_pretrained( - model_args.student_model_name_or_path, - from_tf=bool(".ckpt" in model_args.student_model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - teacher_model = None - if model_args.teacher_model_name_or_path != None: - teacher_model = AutoModelForQuestionAnswering.from_pretrained( - model_args.teacher_model_name_or_path, - from_tf=bool(".ckpt" in model_args.teacher_model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - ) - teacher_model_parameters = filter(lambda p: p.requires_grad, teacher_model.parameters()) - params = sum([np.prod(p.size()) for p in teacher_model_parameters]) - logger.info("Teacher Model has %s parameters", params) - - student_model_parameters = filter(lambda p: p.requires_grad, student_model.parameters()) - params = sum([np.prod(p.size()) for p in student_model_parameters]) - logger.info("Student Model has %s parameters", params) - - # Tokenizer check: this script requires a fast tokenizer. - if not isinstance(tokenizer, PreTrainedTokenizerFast): - raise ValueError( - "This example script only works for models that have a fast tokenizer. Checkout the big table of models " - "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this " - "requirement" - ) - - # Preprocessing the datasets. - # Preprocessing is slighlty different for training and evaluation. - if training_args.do_train: - column_names = datasets["train"].column_names - elif training_args.do_eval: - column_names = datasets["validation"].column_names - else: - column_names = datasets["test"].column_names - question_column_name = "question" if "question" in column_names else column_names[0] - context_column_name = "context" if "context" in column_names else column_names[1] - answer_column_name = "answers" if "answers" in column_names else column_names[2] - - # Padding side determines if we do (question|context) or (context|question). - pad_on_right = tokenizer.padding_side == "right" - - if data_args.max_seq_length > tokenizer.model_max_length: - logger.warning( - f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" - f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." - ) - max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) - - # Training preprocessing - def prepare_train_features(examples): - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. 
- tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding example. This key gives us just that. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - # The offset mappings will give us a map from token to character position in the original context. This will - # help us compute the start_positions and end_positions. - offset_mapping = tokenized_examples.pop("offset_mapping") - - # Let's label those examples! - tokenized_examples["start_positions"] = [] - tokenized_examples["end_positions"] = [] - - for i, offsets in enumerate(offset_mapping): - # We will label impossible answers with the index of the CLS token. - input_ids = tokenized_examples["input_ids"][i] - cls_index = input_ids.index(tokenizer.cls_token_id) - - # Grab the sequence corresponding to that example (to know what is the context and what is the question). - sequence_ids = tokenized_examples.sequence_ids(i) - - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - answers = examples[answer_column_name][sample_index] - # If no answers are given, set the cls_index as answer. - if len(answers["answer_start"]) == 0: - tokenized_examples["start_positions"].append(cls_index) - tokenized_examples["end_positions"].append(cls_index) - else: - # Start/end character index of the answer in the text. - start_char = answers["answer_start"][0] - end_char = start_char + len(answers["text"][0]) - - # Start token index of the current span in the text. - token_start_index = 0 - while sequence_ids[token_start_index] != (1 if pad_on_right else 0): - token_start_index += 1 - - # End token index of the current span in the text. - token_end_index = len(input_ids) - 1 - while sequence_ids[token_end_index] != (1 if pad_on_right else 0): - token_end_index -= 1 - - # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). - if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): - tokenized_examples["start_positions"].append(cls_index) - tokenized_examples["end_positions"].append(cls_index) - else: - # Otherwise move the token_start_index and token_end_index to the two ends of the answer. - # Note: we could go after the last offset if the answer is the last word (edge case). 
- while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: - token_start_index += 1 - tokenized_examples["start_positions"].append(token_start_index - 1) - while offsets[token_end_index][1] >= end_char: - token_end_index -= 1 - tokenized_examples["end_positions"].append(token_end_index + 1) - - return tokenized_examples - - if training_args.do_train: - if "train" not in datasets: - raise ValueError("--do_train requires a train dataset") - train_dataset = datasets["train"] - if data_args.max_train_samples is not None: - # We will select sample from whole data if agument is specified - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - # Create train feature from dataset - train_dataset = train_dataset.map( - prepare_train_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - ) - if data_args.max_train_samples is not None: - # Number of samples might increase during Feature Creation, We select only specified max samples - train_dataset = train_dataset.select(range(data_args.max_train_samples)) - - # Validation preprocessing - def prepare_validation_features(examples): - # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results - # in one example possible giving several features when a context is long, each of those features having a - # context that overlaps a bit the context of the previous feature. - tokenized_examples = tokenizer( - examples[question_column_name if pad_on_right else context_column_name], - examples[context_column_name if pad_on_right else question_column_name], - truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=data_args.doc_stride, - return_overflowing_tokens=True, - return_offsets_mapping=True, - padding="max_length" if data_args.pad_to_max_length else False, - ) - - # Since one example might give us several features if it has a long context, we need a map from a feature to - # its corresponding example. This key gives us just that. - sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") - - # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the - # corresponding example_id and we will store the offset mappings. - tokenized_examples["example_id"] = [] - - for i in range(len(tokenized_examples["input_ids"])): - # Grab the sequence corresponding to that example (to know what is the context and what is the question). - sequence_ids = tokenized_examples.sequence_ids(i) - context_index = 1 if pad_on_right else 0 - - # One example can give several spans, this is the index of the example containing this span of text. - sample_index = sample_mapping[i] - tokenized_examples["example_id"].append(examples["id"][sample_index]) - - # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token - # position is part of the context or not. 
- tokenized_examples["offset_mapping"][i] = [ - (o if sequence_ids[k] == context_index else None) - for k, o in enumerate(tokenized_examples["offset_mapping"][i]) - ] - - return tokenized_examples - - if training_args.do_eval: - if "validation" not in datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_examples = datasets["validation"] - if data_args.max_eval_samples is not None: - # We will select sample from whole data - eval_examples = eval_examples.select(range(data_args.max_eval_samples)) - # Validation Feature Creation - eval_dataset = eval_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - ) - if data_args.max_eval_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) - - if training_args.do_predict: - if "test" not in datasets: - raise ValueError("--do_predict requires a test dataset") - predict_examples = datasets["test"] - if data_args.max_predict_samples is not None: - # We will select sample from whole data - predict_examples = predict_examples.select(range(data_args.max_predict_samples)) - # Predict Feature Creation - predict_dataset = predict_examples.map( - prepare_validation_features, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - ) - if data_args.max_predict_samples is not None: - # During Feature creation dataset samples might increase, we will select required samples again - predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) - - # Data collator - # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data - # collator. - data_collator = ( - default_data_collator - if data_args.pad_to_max_length - else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) - ) - - # Post-processing: - def post_processing_function(examples, features, predictions, stage="eval"): - # Post-processing: we match the start logits and end logits to answers in the original context. - predictions = postprocess_qa_predictions( - examples=examples, - features=features, - predictions=predictions, - version_2_with_negative=data_args.version_2_with_negative, - n_best_size=data_args.n_best_size, - max_answer_length=data_args.max_answer_length, - null_score_diff_threshold=data_args.null_score_diff_threshold, - output_dir=training_args.output_dir, - is_world_process_zero=trainer.is_world_process_zero(), - prefix=stage, - ) - # Format the result to the format the metric expects. 
- if data_args.version_2_with_negative: - formatted_predictions = [ - {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() - ] - else: - formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] - - references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] - return EvalPrediction(predictions=formatted_predictions, label_ids=references) - - metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") - - def compute_metrics(p: EvalPrediction): - return metric.compute(predictions=p.predictions, references=p.label_ids) - - #################################################################################### - # Start SparseML Integration - #################################################################################### - # Initialize our Trainer - trainer = SparseMLDistillQATrainer( - data_args.nm_prune_config, - teacher=teacher_model, - distill_hardness = model_args.distill_hardness, - temperature = model_args.temperature, - model=student_model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - eval_examples=eval_examples if training_args.do_eval else None, - tokenizer=tokenizer, - data_collator=data_collator, - post_process_function=post_processing_function, - compute_metrics=compute_metrics, - ) - #################################################################################### - # End SparseML Integration - #################################################################################### - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Prediction - if training_args.do_predict: - logger.info("*** Predict ***") - results = trainer.predict(predict_dataset, predict_examples) - metrics = results.metrics - - max_predict_samples = ( - data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) - ) - metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) - - trainer.log_metrics("predict", metrics) - trainer.save_metrics("predict", metrics) - - if training_args.push_to_hub: - kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "question-answering"} - if data_args.dataset_name is not None: - kwargs["dataset_tags"] = data_args.dataset_name - if data_args.dataset_config_name is not None: - kwargs["dataset_args"] = data_args.dataset_config_name - kwargs["dataset"] = 
f"{data_args.dataset_name} {data_args.dataset_config_name}" - else: - kwargs["dataset"] = data_args.dataset_name - - trainer.push_to_hub(**kwargs) - - #################################################################################### - # Start SparseML Integration - #################################################################################### - if data_args.do_onnx_export: - logger.info("*** Export to ONNX ***") - os.environ["TOKENIZERS_PARALLELISM"] = "false" - exporter = ModuleExporter( - student_model, output_dir=data_args.onnx_export_path - ) - sample_batch = convert_example_to_features( - datasets["validation"][0], - tokenizer, - data_args.max_seq_length, - data_args.doc_stride - ) - exporter.export_onnx(sample_batch=sample_batch, convert_qat=True) - #################################################################################### - # End SparseML Integration - #################################################################################### - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() From 9f58d85796a1d1d3c1e06418f0bb57e5c0229f65 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 10 Jun 2021 07:27:20 -0400 Subject: [PATCH 5/7] Include distillation into run_qa, code clean up --- examples/pytorch/question-answering/run_qa.py | 160 ++++-------------- 1 file changed, 35 insertions(+), 125 deletions(-) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 009613b38a46..5fac8304002a 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -14,89 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Example script for integrating spaseml with the transformers library. -This script is addopted from hugging face's implementation for Question Answering on the SQUAD Dataset. -Hugging Face's original implementation is regularly updated and can be found at https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py -This script will: -- Load transformer based modesl -- Load a sparseml training and pruning optimizer -- Train on SQUAD -- Evaluate on SQUAD -- Export model to onnx. -########## -Command help: -usage: run_qa.py [-h] \ - --model_name_or_path MODEL \ - [--dataset_name] \ - [--num_train_epochs] \ - [--do_train] \ - [--do_eval] \ - [--per_device_train_batch_size] \ - [--per_device_eval_batch_size] \ - [--learning_rate]\ - [--max_seq_length]\ - [--doc_stride]\ - [--output_dir] \ - [--overwrite_output_dir] \ - [--cache_dir]\ - [--preprocessing_num_workers] \ - [--seed] 42 \ - [--nm_prune_config] - [--do_onnx_export] - [--onnx_export_path] - -Train, prune, and evaluate a transformer base question answering model on squad. - -h, --help show this help message and exit - --model_name_or_path MODEL The path to the transformers model you wish to train - or the name of the pretrained language model you wish - to use. ex: bert-base-uncased. - --dataset_name The name of which dataset you want to use to train or - your model. ex: squad for using SQuAD. - --num_train_epochs Paramater to control how many training epochs you wish - your model to train. - --do_train Boolean denoting if the model should be trained - or not. Default is false. - --do_eval Boolean denoting if the model should be evaluated - or not. Default is false. - --per_device_train_batch_size Size of each training batch based on samples per GPU. 
- 12 will fit in a 11gb GPU, 16 in a 16gb. - --per_device_eval_batch_size Size of each training batch based on samples per GPU. - 12 will fit in a 11gb GPU, 16 in a 16gb. - --learning_rate Learning rate initial float value. ex: 3e-5. - --max_seq_length Int for the max sequence length to be parsed as a context - window. ex: 384 tokens. - --output_dir Path which model checkpoints and paths should be saved. - --overwrite_output_dir Boolean to define if the - --cache_dir Directiory which cached transformer files(datasets, models - , tokenizers) are saved for fast loading. - --preprocessing_num_workers The amount of cpu workers which are used to process datasets - --seed Int which determines what random seed is for training/shuffling - --nm_prune_config Path to the neural magic prune configuration file. examples can - be found in prune_config_files but are customized for bert-base-uncased. - --do_onnx_export Boolean denoting if the model should be exported to onnx - --onnx_export_path Path where onnx model path will be exported. ex: onnx-export - -########## -Example command for training a 95% sparse BERT SQUAD model for 1 epoch: -python run_qa.py \ - --model_name_or_path bert-base-uncased \ - --dataset_name squad \ - --num_train_epochs 1 \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 12 \ - --per_device_eval_batch_size 12 \ - --learning_rate 3e-5 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir 95sparsity1epoch/ \ - --overwrite_output_dir \ - --cache_dir cache \ - --preprocessing_num_workers 8 \ - --seed 42 \ - --nm_prune_config prune_config_files/95sparsity1epoch.yaml \ - --do_onnx_export \ - --onnx_export_path 95sparsity1epoch/ +Fine-tuning the library models for question answering. """ # You can also adapt this script on your own question answering task. Pointers for this are left as comments. @@ -106,9 +24,11 @@ from dataclasses import dataclass, field from typing import Optional +import numpy from datasets import load_dataset, load_metric import transformers +from sparseml_utils import SparseMLQATrainer, export_model from transformers import ( AutoConfig, AutoModelForQuestionAnswering, @@ -125,11 +45,6 @@ from transformers.utils import check_min_version from utils_qa import postprocess_qa_predictions -# Start SparseML integration -from sparseml_utils import SparseMLQATrainer, convert_example_to_features -from sparseml.pytorch.utils import ModuleExporter -# End SparseML integration - # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.7.0.dev0") @@ -142,10 +57,18 @@ class ModelArguments: """ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. """ - model_name_or_path: str = field( metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} ) + teacher_model_name_or_path: Optional[str] = field( + default=None, metadata={"help": "Teacher model which needs to be a trained QA model"} + ) + temperature: Optional[float] = field( + default=2.0, metadata={"help": "Temperature applied to teacher softmax for distillation."} + ) + distill_hardness: Optional[float] = field( + default=1.0, metadata={"help": "Proportion of loss coming from teacher model."} + ) config_name: Optional[str] = field( default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} ) @@ -175,21 +98,13 @@ class DataTrainingArguments: Arguments pertaining to what data we are going to input our model for training and eval. 
""" - #################################################################################### - # Start SparseML Integration - #################################################################################### - nm_prune_config: Optional[str] = field( - default="recipes/noprune1epoch.yaml", + recipe: Optional[str] = field( + default=None, metadata={"help": "The input file name for the Neural Magic pruning config"}, ) - do_onnx_export: bool = field(default=True, metadata={"help": "Export model to onnx"}) onnx_export_path: Optional[str] = field( - default="onnx-export", metadata={"help": "The filename and path which will be where onnx model is outputed"} + default=None, metadata={"help": "The filename and path which will be where onnx model is outputed"} ) - #################################################################################### - # End SparseML Integration - #################################################################################### - dataset_name: Optional[str] = field( default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} ) @@ -401,6 +316,18 @@ def main(): use_auth_token=True if model_args.use_auth_token else None, ) + teacher_model = None + if model_args.teacher_model_name_or_path is not None: + teacher_model = AutoModelForQuestionAnswering.from_pretrained( + model_args.teacher_model_name_or_path, + from_tf=bool(".ckpt" in model_args.teacher_model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + ) + teacher_model_parameters = filter(lambda p: p.requires_grad, teacher_model.parameters()) + params = sum([numpy.prod(p.size()) for p in teacher_model_parameters]) + logger.info("Teacher Model has %s parameters", params) + # Tokenizer check: this script requires a fast tokenizer. 
if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( @@ -643,12 +570,12 @@ def post_processing_function(examples, features, predictions, stage="eval"): def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) - #################################################################################### - # Start SparseML Integration - #################################################################################### # Initialize our Trainer trainer = SparseMLQATrainer( - data_args.nm_prune_config, + data_args.recipe, + teacher=teacher_model, + distill_hardness=model_args.distill_hardness, + temperature=model_args.temperature, model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, @@ -659,9 +586,6 @@ def compute_metrics(p: EvalPrediction): post_process_function=post_processing_function, compute_metrics=compute_metrics, ) - #################################################################################### - # End SparseML Integration - #################################################################################### # Training if training_args.do_train: @@ -720,25 +644,11 @@ def compute_metrics(p: EvalPrediction): trainer.push_to_hub(**kwargs) - #################################################################################### - # Start SparseML Integration - #################################################################################### - if data_args.do_onnx_export: + if data_args.onnx_export_path: logger.info("*** Export to ONNX ***") - os.environ["TOKENIZERS_PARALLELISM"] = "false" - exporter = ModuleExporter( - model, output_dir=data_args.onnx_export_path - ) - sample_batch = convert_example_to_features( - datasets["validation"][0], - tokenizer, - data_args.max_seq_length, - data_args.doc_stride - ) - exporter.export_onnx(sample_batch=sample_batch, convert_qat=True) - #################################################################################### - # End SparseML Integration - #################################################################################### + eval_dataloader = trainer.get_eval_dataloader(eval_dataset) + export_model(model, eval_dataloader, data_args.onnx_export_path) + def _mp_fn(index): # For xla_spawn (TPUs) From cea4db3d82d99f6e5fc3e617324b3712a83ef7e6 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 10 Jun 2021 07:28:23 -0400 Subject: [PATCH 6/7] Unify distill/non-distill trainer, simplify onnx export --- .../question-answering/sparseml_utils.py | 173 +++++------------- 1 file changed, 41 insertions(+), 132 deletions(-) diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py index f0c1f124a4ea..8e0aad3b52e4 100644 --- a/examples/pytorch/question-answering/sparseml_utils.py +++ b/examples/pytorch/question-answering/sparseml_utils.py @@ -1,42 +1,52 @@ -import collections import math + import torch import torch.nn.functional as F -import numpy -from trainer_qa import QuestionAnsweringTrainer from sparseml.pytorch.optim.manager import ScheduledModifierManager from sparseml.pytorch.optim.optimizer import ScheduledOptimizer - -from sparseml.pytorch.utils import logger +from sparseml.pytorch.utils import ModuleExporter, logger +from trainer_qa import QuestionAnsweringTrainer class SparseMLQATrainer(QuestionAnsweringTrainer): """ - Question Answering trainer with customized optimizer using SparseML + Question Answering trainer with SparseML integration - :param 
nm_prune_config: recipe for model sparsification + :param recipe: recipe for model sparsification + :param teacher: teacher model for distillation + :param distill_hardness: ratio of loss by teacher targets (between 0 and 1) + :param temperature: temperature for distillation :param args, kwargs: arguments passed into parent class """ - def __init__(self, nm_prune_config, *args, **kwargs): + def __init__(self, recipe, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs): super().__init__(*args, **kwargs) - self.nm_prune_config = nm_prune_config + self.recipe = recipe + self.teacher = teacher + self.distill_hardness = distill_hardness + self.temperature = temperature + self.criterion = torch.nn.CrossEntropyLoss() + self.manager = None - loggers = [] - if "wandb" in self.args.report_to: - loggers.append(logger.WANDBLogger()) - self.loggers = loggers + self.loggers = None + if self.recipe is not None: + loggers = [] + if "wandb" in self.args.report_to: + loggers.append(logger.WANDBLogger()) + self.loggers = loggers def create_optimizer(self): """ Create optimizer customized using SparseML """ super().create_optimizer() + if self.recipe is None: + return steps_per_epoch = math.ceil( len(self.train_dataset) / (self.args.per_device_train_batch_size * self.args._n_gpu) ) - self.manager = ScheduledModifierManager.from_yaml(self.nm_prune_config) + self.manager = ScheduledModifierManager.from_yaml(self.recipe) self.args.num_train_epochs = float(self.manager.max_epochs) if hasattr(self, "scaler"): self.manager.initialize(self.model, epoch=0.0, loggers=self.loggers) @@ -48,38 +58,23 @@ def create_optimizer(self): self.optimizer, self.model, self.manager, steps_per_epoch=steps_per_epoch, loggers=self.loggers ) - -class SparseMLDistillQATrainer(SparseMLQATrainer): - """ - Question Answering trainer using distilation with customized optimizer using SparseML - - :param nm_prune_config: recipe for model sparsification - :param teacher: teacher model - :param distill_hardness: weight of the teacher loss - :param temperature: temperature used for loss - :param args, kwargs: arguments passed into parent class - """ - - def __init__(self, nm_prune_config, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs): - super().__init__(nm_prune_config, *args, **kwargs) - self.teacher = teacher - self.distill_hardness = distill_hardness - self.temperature = temperature - self.criterion = torch.nn.CrossEntropyLoss() - def compute_loss(self, model, inputs, return_outputs=False): """ Computing loss using teacher/student distillation """ + if self.recipe is None or self.teacher is None: + return super().compute_loss(model, inputs, return_outputs=return_outputs) + outputs = model(**inputs) - loss = outputs["loss"] - if self.teacher is not None: + if self.teacher is None: + loss = outputs["loss"] + else: input_device = inputs["input_ids"].device self.teacher = self.teacher.to(input_device) start_logits_student = outputs["start_logits"] end_logits_student = outputs["end_logits"] start_logits_label = inputs["start_positions"] - end_logits_label = inputs["start_positions"] + end_logits_label = inputs["end_positions"] with torch.no_grad(): teacher_output = self.teacher( input_ids=inputs["input_ids"], @@ -112,101 +107,15 @@ def compute_loss(self, model, inputs, return_outputs=False): return (loss, outputs) if return_outputs else loss -def convert_example_to_features(example, tokenizer, max_seq_length, doc_stride, max_query_length=30): +def export_model(model, dataloader, output_dir): """ - Convert 
example to features, used for onnx export + Export a trained model to ONNX + :param model: trained model + :param dataloader: dataloader to get sample batch + :param output_dir: output directory for ONNX model """ - Feature = collections.namedtuple( - "Feature", - [ - "unique_id", - "tokens", - "example_index", - "token_to_orig_map", - "token_is_max_context", - ], - ) - extra = [] - unique_id = 0 - query_tokens = tokenizer.tokenize(example["question"])[0:max_query_length] - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example["context"]): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - input_ids = tokenizer.convert_tokens_to_ids(tokens) - input_mask = [1] * len(input_ids) - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - feature = Feature( - unique_id=unique_id, - tokens=tokens, - example_index=0, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - ) - extra.append(feature) - unique_id += 1 - # extra is used as additional data but sparseml doesn't support it - return ( - torch.from_numpy(numpy.array([numpy.array(input_ids, dtype=numpy.int64)])), - torch.from_numpy(numpy.array([numpy.array(input_mask, dtype=numpy.int64)])), - torch.from_numpy(numpy.array([numpy.array(segment_ids, dtype=numpy.int64)])), - ) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - return cur_span_index == best_span_index + exporter = ModuleExporter(model, output_dir=output_dir) + for _, sample_batch in enumerate(dataloader): + sample_input = (sample_batch["input_ids"], sample_batch["attention_mask"], sample_batch["token_type_ids"]) + 
exporter.export_onnx(sample_batch=sample_input, convert_qat=True) + break From dcaa723f85a9f4de64cebffd836d5ff32bad11e3 Mon Sep 17 00:00:00 2001 From: Tuan Nguyen Date: Thu, 10 Jun 2021 15:55:00 -0400 Subject: [PATCH 7/7] Simplify variable names for distillation --- examples/pytorch/question-answering/run_qa.py | 15 ++++++++------- .../question-answering/sparseml_utils.py | 18 +++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 5fac8304002a..0cdef4787b4c 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -60,10 +60,10 @@ class ModelArguments: model_name_or_path: str = field( metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} ) - teacher_model_name_or_path: Optional[str] = field( + distill_teacher: Optional[str] = field( default=None, metadata={"help": "Teacher model which needs to be a trained QA model"} ) - temperature: Optional[float] = field( + distill_temperature: Optional[float] = field( default=2.0, metadata={"help": "Temperature applied to teacher softmax for distillation."} ) distill_hardness: Optional[float] = field( @@ -100,7 +100,8 @@ class DataTrainingArguments: recipe: Optional[str] = field( default=None, - metadata={"help": "The input file name for the Neural Magic pruning config"}, + metadata={"help": "Path to a SparseML sparsification recipe, see https://github.com/neuralmagic/sparseml " + "for more information"}, ) onnx_export_path: Optional[str] = field( default=None, metadata={"help": "The filename and path which will be where onnx model is outputed"} @@ -317,10 +318,10 @@ def main(): ) teacher_model = None - if model_args.teacher_model_name_or_path is not None: + if model_args.distill_teacher is not None: teacher_model = AutoModelForQuestionAnswering.from_pretrained( - model_args.teacher_model_name_or_path, - from_tf=bool(".ckpt" in model_args.teacher_model_name_or_path), + model_args.distill_teacher, + from_tf=bool(".ckpt" in model_args.distill_teacher), config=config, cache_dir=model_args.cache_dir, ) @@ -575,7 +576,7 @@ def compute_metrics(p: EvalPrediction): data_args.recipe, teacher=teacher_model, distill_hardness=model_args.distill_hardness, - temperature=model_args.temperature, + distill_temperature=model_args.distill_temperature, model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, diff --git a/examples/pytorch/question-answering/sparseml_utils.py b/examples/pytorch/question-answering/sparseml_utils.py index 8e0aad3b52e4..ca30f7c61954 100644 --- a/examples/pytorch/question-answering/sparseml_utils.py +++ b/examples/pytorch/question-answering/sparseml_utils.py @@ -16,16 +16,16 @@ class SparseMLQATrainer(QuestionAnsweringTrainer): :param recipe: recipe for model sparsification :param teacher: teacher model for distillation :param distill_hardness: ratio of loss by teacher targets (between 0 and 1) - :param temperature: temperature for distillation + :param distill_temperature: temperature for distillation :param args, kwargs: arguments passed into parent class """ - def __init__(self, recipe, teacher=None, distill_hardness=0.5, temperature=2.0, *args, **kwargs): + def __init__(self, recipe, teacher=None, distill_hardness=0.5, distill_temperature=2.0, *args, **kwargs): super().__init__(*args, **kwargs) self.recipe = recipe self.teacher = teacher self.distill_hardness = distill_hardness - 
self.temperature = temperature + self.distill_temperature = distill_temperature self.criterion = torch.nn.CrossEntropyLoss() self.manager = None @@ -85,19 +85,19 @@ def compute_loss(self, model, inputs, return_outputs=False): end_logits_teacher = teacher_output["end_logits"] loss_start = ( F.kl_div( - input=F.log_softmax(start_logits_student / self.temperature, dim=-1), - target=F.softmax(start_logits_teacher / self.temperature, dim=-1), + input=F.log_softmax(start_logits_student / self.distill_temperature, dim=-1), + target=F.softmax(start_logits_teacher / self.distill_temperature, dim=-1), reduction="batchmean", ) - * (self.temperature ** 2) + * (self.distill_temperature ** 2) ) loss_end = ( F.kl_div( - input=F.log_softmax(end_logits_student / self.temperature, dim=-1), - target=F.softmax(end_logits_teacher / self.temperature, dim=-1), + input=F.log_softmax(end_logits_student / self.distill_temperature, dim=-1), + target=F.softmax(end_logits_teacher / self.distill_temperature, dim=-1), reduction="batchmean", ) - * (self.temperature ** 2) + * (self.distill_temperature ** 2) ) teacher_loss = (loss_start + loss_end) / 2.0 loss_start = self.criterion(start_logits_student, start_logits_label)
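For reference, the distillation objective that compute_loss assembles can be restated compactly. A minimal sketch of the visible logic (function and argument names here are illustrative; the final weighting follows the distill_hardness help text, i.e. the proportion of the loss taken from the teacher targets):

import torch.nn.functional as F

def qa_distillation_loss(start_student, end_student, start_teacher, end_teacher,
                         start_positions, end_positions, hardness=0.5, temperature=2.0):
    # KL divergence between temperature-softened student and teacher distributions;
    # the temperature**2 factor keeps its gradient magnitude comparable to the hard-label term.
    def kl(student_logits, teacher_logits):
        return F.kl_div(
            input=F.log_softmax(student_logits / temperature, dim=-1),
            target=F.softmax(teacher_logits / temperature, dim=-1),
            reduction="batchmean",
        ) * (temperature ** 2)

    teacher_loss = (kl(start_student, start_teacher) + kl(end_student, end_teacher)) / 2.0
    label_loss = (F.cross_entropy(start_student, start_positions)
                  + F.cross_entropy(end_student, end_positions)) / 2.0
    return (1.0 - hardness) * label_loss + hardness * teacher_loss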