From a65d175e1aed9e57e2b64be1054478fc22623a2a Mon Sep 17 00:00:00 2001
From: Eldar Kurtic
Date: Wed, 28 Apr 2021 14:29:24 +0200
Subject: [PATCH] Fix steps_per_epoch calculation

---
 integrations/transformers/run_distill_qa.py | 47 +++++++++++----------
 integrations/transformers/run_qa.py         | 45 ++++++++++----------
 2 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/integrations/transformers/run_distill_qa.py b/integrations/transformers/run_distill_qa.py
index ddaeca51dbd..4c9fc69e359 100644
--- a/integrations/transformers/run_distill_qa.py
+++ b/integrations/transformers/run_distill_qa.py
@@ -19,8 +19,8 @@
 # limitations under the License.
 """
-Example script for integrating spaseml with the transformers library to perform model distillation. 
-This script is addopted from hugging face's implementation for Question Answering on the SQUAD Dataset. 
+Example script for integrating sparseml with the transformers library to perform model distillation.
+This script is adapted from Hugging Face's implementation for Question Answering on the SQuAD dataset.
 Hugging Face's original implementation is regularly updated and can be found at
 https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py
 This script will:
 - Load transformer based models
@@ -54,12 +54,12 @@
     [--onnx_export_path] \
     [--layers_to_keep] \

-Train, prune, and evaluate a transformer base question answering model on squad. 
+Train, prune, and evaluate a transformer-based question answering model on SQuAD.
     -h, --help  show this help message and exit
     --teacher_model_name_or_path  The name or path of model which will be used for distilation.
        Note, this model needs to be trained for QA task already.
     --student_model_name_or_path  The name or path of the model wich will be trained using distilation.
-    --temperature  Hyperparameter which controls model distilation 
+    --temperature  Hyperparameter which controls model distillation
     --distill_hardness  Hyperparameter which controls how much of the loss comes from teacher vs training labels
     --model_name_or_path  The path to the transformers model you wish to train
        or the name of the pretrained language model you wish
@@ -72,21 +72,21 @@
        or not. Default is false.
     --do_eval  Boolean denoting if the model should be evaluated
        or not. Default is false.
-    --per_device_train_batch_size  Size of each training batch based on samples per GPU. 
+    --per_device_train_batch_size  Size of each training batch based on samples per GPU.
        12 will fit in a 11gb GPU, 16 in a 16gb.
-    --per_device_eval_batch_size  Size of each training batch based on samples per GPU. 
+    --per_device_eval_batch_size  Size of each training batch based on samples per GPU.
        12 will fit in a 11gb GPU, 16 in a 16gb.
     --learning_rate  Learning rate initial float value. ex: 3e-5.
-    --max_seq_length  Int for the max sequence length to be parsed as a context 
+    --max_seq_length  Int for the max sequence length to be parsed as a context
        window. ex: 384 tokens.
     --output_dir  Path which model checkpoints and paths should be saved.
-    --overwrite_output_dir  Boolean to define if the 
+    --overwrite_output_dir  Boolean to define if the output directory should be overwritten. Default is false.
     --cache_dir  Directiory which cached transformer files(datasets, models
-       , tokenizers) are saved for fast loading. 
+       , tokenizers) are saved for fast loading.
     --preprocessing_num_workers  The amount of cpu workers which are used to process datasets
     --seed  Int which determines what random seed is for training/shuffling
     --nm_prune_config  Path to the neural magic prune configuration file. examples can
-       be found in prune_config_files but are customized for bert-base-uncased. 
+       be found in prune_config_files but are customized for bert-base-uncased.
     --do_onnx_export  Boolean denoting if the model should be exported to onnx
     --onnx_export_path  Path where onnx model path will be exported. ex: onnx-export
     --layers_to_keep  Number of layers to keep from original model. Layers are dropped before training
@@ -611,7 +611,7 @@ def prepare_validation_features(examples):
     ]
     return tokenized_examples

-transformers.utils.logging.set_verbosity_info() 
+transformers.utils.logging.set_verbosity_info()
 parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
 if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
     # If we pass only one argument to the script and it's the path to a json file,
@@ -639,7 +639,7 @@ def prepare_validation_features(examples):
 )

 logger.warning(
-    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
     + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
 )

@@ -690,10 +690,10 @@ def prepare_validation_features(examples):
 student_model_parameters = filter(lambda p: p.requires_grad, student_model.parameters())
 params = sum([np.prod(p.size()) for p in student_model_parameters])
-logger.info("Student Model has %s parameters", params) 
+logger.info("Student Model has %s parameters", params)
 teacher_model_parameters = filter(lambda p: p.requires_grad, teacher_model.parameters())
 params = sum([np.prod(p.size()) for p in teacher_model_parameters])
-logger.info("Teacher Model has %s parameters", params) 
+logger.info("Teacher Model has %s parameters", params)

 # Tokenizer check: this script requires a fast tokenizer.
 if not isinstance(tokenizer, PreTrainedTokenizerFast):
     raise ValueError(
@@ -710,7 +710,7 @@ def prepare_validation_features(examples):
 context_column_name = "context" if "context" in column_names else column_names[1]
 answer_column_name = "answers" if "answers" in column_names else column_names[2]
-pad_on_right = tokenizer.padding_side == "right" 
+pad_on_right = tokenizer.padding_side == "right"

 data_collator = (
     default_data_collator
@@ -744,15 +744,16 @@ def prepare_validation_features(examples):
 )
 ####################################################################################
 # Start SparseML Integration
-#################################################################################### 
-optim = load_optimizer(student_model, TrainingArguments)
-steps_per_epoch = math.ceil(len(datasets["train"]) / (training_args.per_device_train_batch_size*training_args._n_gpu))
-manager = ScheduledModifierManager.from_yaml(data_args.nm_prune_config)
-training_args.num_train_epochs = float(manager.modifiers[0].end_epoch)
-optim = ScheduledOptimizer(optim, student_model, manager, steps_per_epoch=steps_per_epoch, loggers=None)
+####################################################################################
+if training_args.do_train:
+    optim = load_optimizer(student_model, TrainingArguments)
+    steps_per_epoch = math.ceil(len(train_dataset) / (training_args.per_device_train_batch_size * training_args._n_gpu))
+    manager = ScheduledModifierManager.from_yaml(data_args.nm_prune_config)
+    training_args.num_train_epochs = float(manager.modifiers[0].end_epoch)
+    optim = ScheduledOptimizer(optim, student_model, manager, steps_per_epoch=steps_per_epoch, loggers=None)
 ####################################################################################
 # End SparseML Integration
-#################################################################################### 
+####################################################################################
 # Initialize our Trainer
 trainer = DistillQuestionAnsweringTrainer(
     model=student_model,
@@ -764,7 +765,7 @@ def prepare_validation_features(examples):
     data_collator=data_collator,
     post_process_function=post_processing_function,
     compute_metrics=compute_metrics,
-    optimizers=(optim, None),
+    optimizers=(optim, None) if training_args.do_train else (None, None),
     teacher=teacher_model,
     distill_hardness = model_args.distill_hardness,
     temperature = model_args.temperature,
diff --git a/integrations/transformers/run_qa.py b/integrations/transformers/run_qa.py
index b6f683bc13d..a9ad798ac89 100644
--- a/integrations/transformers/run_qa.py
+++ b/integrations/transformers/run_qa.py
@@ -19,8 +19,8 @@
 # limitations under the License.
 """
-Example script for integrating spaseml with the transformers library. 
-This script is addopted from hugging face's implementation for Question Answering on the SQUAD Dataset. 
+Example script for integrating sparseml with the transformers library.
+This script is adapted from Hugging Face's implementation for Question Answering on the SQuAD dataset.
 Hugging Face's original implementation is regularly updated and can be found at
 https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_qa.py
 This script will:
 - Load transformer based modesl
@@ -50,7 +50,7 @@
     [--do_onnx_export]
     [--onnx_export_path]

-Train, prune, and evaluate a transformer base question answering model on squad. 
+Train, prune, and evaluate a transformer-based question answering model on SQuAD.
     -h, --help  show this help message and exit
     --model_name_or_path MODEL  The path to the transformers model you wish to train
        or the name of the pretrained language model you wish
@@ -63,21 +63,21 @@
        or not. Default is false.
     --do_eval  Boolean denoting if the model should be evaluated
        or not. Default is false.
-    --per_device_train_batch_size  Size of each training batch based on samples per GPU. 
+    --per_device_train_batch_size  Size of each training batch based on samples per GPU.
        12 will fit in a 11gb GPU, 16 in a 16gb.
-    --per_device_eval_batch_size  Size of each training batch based on samples per GPU. 
+    --per_device_eval_batch_size  Size of each training batch based on samples per GPU.
        12 will fit in a 11gb GPU, 16 in a 16gb.
     --learning_rate  Learning rate initial float value. ex: 3e-5.
-    --max_seq_length  Int for the max sequence length to be parsed as a context 
+    --max_seq_length  Int for the max sequence length to be parsed as a context
        window. ex: 384 tokens.
     --output_dir  Path which model checkpoints and paths should be saved.
-    --overwrite_output_dir  Boolean to define if the 
+    --overwrite_output_dir  Boolean to define if the output directory should be overwritten. Default is false.
     --cache_dir  Directiory which cached transformer files(datasets, models
-       , tokenizers) are saved for fast loading. 
+       , tokenizers) are saved for fast loading.
     --preprocessing_num_workers  The amount of cpu workers which are used to process datasets
     --seed  Int which determines what random seed is for training/shuffling
     --nm_prune_config  Path to the neural magic prune configuration file. examples can
-       be found in prune_config_files but are customized for bert-base-uncased. 
+       be found in prune_config_files but are customized for bert-base-uncased.
     --do_onnx_export  Boolean denoting if the model should be exported to onnx
     --onnx_export_path  Path where onnx model path will be exported. ex: onnx-export

@@ -101,7 +101,7 @@
     --seed 42 \
     --nm_prune_config prune_config_files/95sparsity1epoch.yaml \
     --do_onnx_export \
-    --onnx_export_path 95sparsity1epoch/ 
+    --onnx_export_path 95sparsity1epoch/
 """
 import collections
 import json
@@ -590,7 +590,7 @@ def prepare_validation_features(examples):

     return tokenized_examples

-transformers.utils.logging.set_verbosity_info() 
+transformers.utils.logging.set_verbosity_info()
 parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
 if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
     # If we pass only one argument to the script and it's the path to a json file,
@@ -618,7 +618,7 @@ def prepare_validation_features(examples):
 )

 logger.warning(
-    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
     + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
 )

@@ -663,7 +663,7 @@ def prepare_validation_features(examples):
 model_parameters = filter(lambda p: p.requires_grad, model.parameters())
 params = sum([np.prod(p.size()) for p in model_parameters])
-logger.info("Model has %s parameters", params) 
+logger.info("Model has %s parameters", params)

 # Tokenizer check: this script requires a fast tokenizer.
 if not isinstance(tokenizer, PreTrainedTokenizerFast):
     raise ValueError(
@@ -679,7 +679,7 @@ def prepare_validation_features(examples):
 question_column_name = "question" if "question" in column_names else column_names[0]
 context_column_name = "context" if "context" in column_names else column_names[1]
 answer_column_name = "answers" if "answers" in column_names else column_names[2]
-pad_on_right = tokenizer.padding_side == "right" 
+pad_on_right = tokenizer.padding_side == "right"

 if training_args.do_train:
     train_dataset = datasets["train"].map(
@@ -714,12 +714,13 @@ def prepare_validation_features(examples):

 ####################################################################################
 # Start SparseML Integration
-#################################################################################### 
-optim = load_optimizer(model, TrainingArguments)
-steps_per_epoch = math.ceil(len(datasets["train"]) / (training_args.per_device_train_batch_size*training_args._n_gpu))
-manager = ScheduledModifierManager.from_yaml(data_args.nm_prune_config)
-training_args.num_train_epochs = float(manager.max_epochs)
-optim = ScheduledOptimizer(optim, model, manager, steps_per_epoch=steps_per_epoch, loggers=None)
+####################################################################################
+if training_args.do_train:
+    optim = load_optimizer(model, TrainingArguments)
+    steps_per_epoch = math.ceil(len(train_dataset) / (training_args.per_device_train_batch_size * training_args._n_gpu))
+    manager = ScheduledModifierManager.from_yaml(data_args.nm_prune_config)
+    training_args.num_train_epochs = float(manager.max_epochs)
+    optim = ScheduledOptimizer(optim, model, manager, steps_per_epoch=steps_per_epoch, loggers=None)
 ####################################################################################
 # End SparseML Integration
 ####################################################################################
@@ -734,7 +735,7 @@ def prepare_validation_features(examples):
     data_collator=data_collator,
     post_process_function=post_processing_function,
     compute_metrics=compute_metrics,
-    optimizers=(optim, None),
+    optimizers=(optim, None) if training_args.do_train else (None, None),
 )

 # Training
@@ -765,7 +766,7 @@ def prepare_validation_features(examples):
 ####################################################################################
 if data_args.do_onnx_export:
     logger.info("*** Export to ONNX ***")
-    print("Exporting onnx model") 
+    print("Exporting onnx model")
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     exporter = ModuleExporter(
         model, output_dir='onnx-export'
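
Why the steps_per_epoch change matters (an illustrative note; the sketch below is not part of the patch): prepare_train_features splits long SQuAD contexts into several overlapping features, so len(train_dataset), the tokenized set the Trainer actually iterates over, is typically larger than len(datasets["train"]), the raw example count. ScheduledOptimizer uses steps_per_epoch to translate optimizer steps into epoch progress for the pruning schedule, so computing it from raw examples lets the schedule run slightly ahead of the actual training epochs. The new "if training_args.do_train:" guard also keeps eval-only or export-only runs from building an optimizer over a training set that was never prepared. A minimal sketch of the arithmetic, using hypothetical dataset sizes and batch settings (the numbers are illustrative only, not taken from the patch):

    import math

    # Hypothetical sizes, chosen only to illustrate the difference.
    raw_examples = 87_599        # len(datasets["train"]): raw training examples
    tokenized_features = 88_524  # len(train_dataset): features after prepare_train_features
    per_device_batch_size = 16
    n_gpu = 1

    # Old calculation: counts raw examples, so it under-reports the steps in an epoch
    # whenever tokenization yields more features than examples.
    steps_old = math.ceil(raw_examples / (per_device_batch_size * n_gpu))

    # Fixed calculation from this patch: counts the tokenized features that are
    # actually batched, so the pruning schedule lines up with real optimizer steps.
    steps_new = math.ceil(tokenized_features / (per_device_batch_size * n_gpu))

    print(steps_old, steps_new)  # 5475 vs. 5533 steps per epoch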