diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py
index 15be8fdc4..451c4c246 100644
--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -124,11 +124,27 @@ def add_parser_arguments(self):
             help='Enable option to pin memory in data loader.',
         )
 
+        self._parser.add_argument(
+            '--force_fp32',
+            action='store_true',
+            default=False,
+            help='Enable option to use full float32 precision.',
+        )
+
     @abstractmethod
     def _judge_gpu_availability(self):
         """Judge GPUs' availability according to arguments and running environment."""
         pass
 
+    @abstractmethod
+    def _set_force_fp32(self):
+        """Set the config that controls whether full float32 precision will be used.
+
+        On Ampere or newer GPUs, PyTorch and TensorFlow use TF32 instead of FP32 by default.
+        TF32 execution can be disabled by setting force_fp32 to True.
+        """
+        pass
+
     @abstractmethod
     def _init_distributed_setting(self):
         """Initialize the distributed library and bind the worker to GPU.
@@ -166,9 +182,10 @@ def _preprocess(self):
             return False
 
         self._judge_gpu_availability()
+        self._set_force_fp32()
         logger.info(
-            'Model placement - model: {}, GPU availablility: {}, pin memory: {}.'.format(
-                self._name, self._gpu_available, self._args.pin_memory
+            'Model placement - model: {}, GPU availability: {}, pin memory: {}, force fp32: {}.'.format(
+                self._name, self._gpu_available, self._args.pin_memory, self._args.force_fp32
             )
         )
 
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py
index 0497dd750..916985c8b 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -32,6 +32,15 @@ def _judge_gpu_availability(self):
         """Judge GPUs' availability according to arguments and running environment."""
         self._gpu_available = not self._args.no_gpu and torch.cuda.is_available()
 
+    def _set_force_fp32(self):
+        """Set the config that controls whether full float32 precision will be used.
+
+        On Ampere or newer GPUs, PyTorch and TensorFlow use TF32 instead of FP32 by default.
+        TF32 execution can be disabled by setting force_fp32 to True.
+        """
+        torch.backends.cuda.matmul.allow_tf32 = not self._args.force_fp32
+        torch.backends.cudnn.allow_tf32 = not self._args.force_fp32
+
     def _init_distributed_setting(self):
         """Initialize the distributed library and bind the worker to GPU.
 
diff --git a/tests/benchmarks/model_benchmarks/test_model_base.py b/tests/benchmarks/model_benchmarks/test_model_base.py
index 02e480ede..6d000fb86 100644
--- a/tests/benchmarks/model_benchmarks/test_model_base.py
+++ b/tests/benchmarks/model_benchmarks/test_model_base.py
@@ -42,6 +42,10 @@ def _judge_gpu_availability(self):
         """Judge GPUs' availability according to arguments and running environment."""
         self._gpu_available = False
 
+    def _set_force_fp32(self):
+        """Set the config that controls whether full float32 precision will be used."""
+        pass
+
     def _init_distributed_setting(self):
         """Initialize the distributed library and bind the worker to GPU."""
         return True
@@ -161,6 +165,7 @@ def test_arguments_related_interfaces():
                         Distributed backends. E.g. nccl mpi gloo.
   --no_gpu              Disable GPU training.
   --pin_memory          Enable option to pin memory in data loader.
+  --force_fp32          Enable option to use full float32 precision.
   --hidden_size int     Hidden size.
   --seq_len int         Sequence length."""
     )
@@ -194,6 +199,7 @@ def test_preprocess():
                         Distributed backends. E.g. nccl mpi gloo.
   --no_gpu              Disable GPU training.
   --pin_memory          Enable option to pin memory in data loader.
+  --force_fp32          Enable option to use full float32 precision.
   --hidden_size int     Hidden size.
   --seq_len int         Sequence length."""
     )
diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
index 44b3603c8..0db5db0c1 100644
--- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py
+++ b/tests/benchmarks/model_benchmarks/test_pytorch_base.py
@@ -178,7 +178,7 @@ def test_pytorch_base():
     BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)
 
     # Launch benchmark with --no_gpu for testing.
-    parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train inference --no_gpu'
+    parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train inference --no_gpu --force_fp32'
     benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
     assert (benchmark)
     assert (benchmark._preprocess())
@@ -202,6 +202,9 @@ def test_pytorch_base():
     # Test _judge_gpu_availability().
     assert (benchmark._gpu_available is False)
 
+    # Test _set_force_fp32().
+    assert (benchmark._args.force_fp32 is True)
+
     # Test _init_distributed_setting().
     assert (benchmark._args.distributed_impl is None)
     assert (benchmark._args.distributed_backend is None)
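
For clarity, a minimal standalone sketch of the behavior this flag is meant to drive. The argparse wiring below is illustrative only (the benchmark classes build their own parser); the `torch.backends.cuda.matmul.allow_tf32` and `torch.backends.cudnn.allow_tf32` switches are the ones touched by `_set_force_fp32`. Note the negation: asking for full FP32 means TF32 must be turned off.

```python
# Illustrative sketch, not part of the patch: the parser here is a stand-in
# for the benchmark's own argument parser.
import argparse

import torch

parser = argparse.ArgumentParser()
parser.add_argument(
    '--force_fp32',
    action='store_true',
    default=False,
    help='Enable option to use full float32 precision.',
)
args = parser.parse_args(['--force_fp32'])

# When full FP32 is requested, TF32 must be disabled, hence the negation.
torch.backends.cuda.matmul.allow_tf32 = not args.force_fp32
torch.backends.cudnn.allow_tf32 = not args.force_fp32

print(torch.backends.cuda.matmul.allow_tf32)  # False -> matmuls run in full FP32
print(torch.backends.cudnn.allow_tf32)        # False -> cuDNN convolutions run in full FP32
```

Without `--force_fp32`, both flags stay True, so Ampere and newer GPUs keep the default TF32 fast path.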