Benchmarks: Add Feature - Add option to use fp32 instead of tf32 #213

Merged · 1 commit · Sep 27, 2021
21 changes: 19 additions & 2 deletions superbench/benchmarks/model_benchmarks/model_base.py
@@ -124,11 +124,27 @@ def add_parser_arguments(self):
help='Enable option to pin memory in data loader.',
)

self._parser.add_argument(
'--force_fp32',
action='store_true',
default=False,
help='Enable option to use full float32 precision.',
)

@abstractmethod
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
pass

@abstractmethod
def _set_force_fp32(self):
"""Set the config that controls whether full float32 precision will be used.

On Ampere or newer GPUs, PyTorch and TensorFlow use TF32 instead of FP32 by default.
We can disable TF32 execution by setting force_fp32 to True.
"""
pass

@abstractmethod
def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU.
@@ -166,9 +182,10 @@ def _preprocess(self):
return False

self._judge_gpu_availability()
self._set_force_fp32()
logger.info(
'Model placement - model: {}, GPU availability: {}, pin memory: {}.'.format(
self._name, self._gpu_available, self._args.pin_memory
'Model placement - model: {}, GPU availability: {}, pin memory: {}, force fp32: {}.'.format(
self._name, self._gpu_available, self._args.pin_memory, self._args.force_fp32
)
)

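The new flag is a standard argparse store_true option: it stays False unless the benchmark is launched with --force_fp32. A minimal standalone sketch of that behaviour (using a throwaway parser; the real flag is registered on the benchmark's internal self._parser):

import argparse

# Throwaway parser mirroring the argument definition added above.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--force_fp32',
    action='store_true',
    default=False,
    help='Enable option to use full float32 precision.',
)

assert parser.parse_args([]).force_fp32 is False                # default: TF32 remains allowed
assert parser.parse_args(['--force_fp32']).force_fp32 is True   # flag given: request full FP32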
9 changes: 9 additions & 0 deletions superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -32,6 +32,15 @@ def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = not self._args.no_gpu and torch.cuda.is_available()

def _set_force_fp32(self):
"""Set the config that controls whether full float32 precision will be used.

On Ampere or newer GPUs, PyTorch and TensorFlow use TF32 instead of FP32 by default.
We can disable TF32 execution by setting force_fp32 to True.
"""
torch.backends.cuda.matmul.allow_tf32 = not self._args.force_fp32
torch.backends.cudnn.allow_tf32 = not self._args.force_fp32

def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU.

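For context, a standalone sketch of the same logic: on Ampere or newer GPUs PyTorch enables TF32 for CUDA matmuls and cuDNN convolutions by default, so forcing FP32 means turning both backend switches off (the negation of force_fp32, as in the assignment above):

import torch

def set_force_fp32(force_fp32: bool) -> None:
    # force_fp32=True disables TF32 for CUDA matmuls and cuDNN convolutions,
    # so Ampere and newer GPUs fall back to full float32 arithmetic.
    torch.backends.cuda.matmul.allow_tf32 = not force_fp32
    torch.backends.cudnn.allow_tf32 = not force_fp32

set_force_fp32(True)
assert not torch.backends.cuda.matmul.allow_tf32
assert not torch.backends.cudnn.allow_tf32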
6 changes: 6 additions & 0 deletions tests/benchmarks/model_benchmarks/test_model_base.py
@@ -42,6 +42,10 @@ def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = False

def _set_force_fp32(self):
"""Set the config that controls whether full float32 precision will be used."""
pass

def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU."""
return True
@@ -161,6 +165,7 @@ def test_arguments_related_interfaces():
Distributed backends. E.g. nccl mpi gloo.
--no_gpu Disable GPU training.
--pin_memory Enable option to pin memory in data loader.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--seq_len int Sequence length."""
)
@@ -194,6 +199,7 @@ def test_preprocess():
Distributed backends. E.g. nccl mpi gloo.
--no_gpu Disable GPU training.
--pin_memory Enable option to pin memory in data loader.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--seq_len int Sequence length."""
)
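A minimal sketch of how the help-text assertions above can be checked directly, assuming benchmark is an already-constructed ModelBenchmark whose parser has been populated (for example the fake benchmark stubbed at the top of this test module):

# 'benchmark' is a hypothetical, already-constructed benchmark instance.
help_text = benchmark._parser.format_help()
assert '--force_fp32' in help_text  # the new option shows up in the generated help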
5 changes: 4 additions & 1 deletion tests/benchmarks/model_benchmarks/test_pytorch_base.py
@@ -178,7 +178,7 @@ def test_pytorch_base():
BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)

# Launch benchmark with --no_gpu for testing.
parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train inference --no_gpu'
parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train inference --no_gpu --force_fp32'
benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
assert (benchmark)
assert (benchmark._preprocess())
@@ -202,6 +202,9 @@ def test_pytorch_base():
# Test _judge_gpu_availability().
assert (benchmark._gpu_available is False)

# Test _set_force_fp32().
assert (benchmark._args.force_fp32 is True)

# Test _init_distributed_setting().
assert (benchmark._args.distributed_impl is None)
assert (benchmark._args.distributed_backend is None)
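Putting the pieces together, a minimal sketch of how --force_fp32 should propagate to the PyTorch backend switches after preprocessing, reusing the PytorchMNIST class and parameter string from the test above (both assumed to be defined in this test module):

import torch

parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train inference --no_gpu --force_fp32'
benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
assert benchmark._preprocess()
assert benchmark._args.force_fp32 is True
# With the negated assignment in _set_force_fp32, forcing FP32 turns TF32 off.
assert not torch.backends.cuda.matmul.allow_tf32
assert not torch.backends.cudnn.allow_tf32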