From 5009ea47a90a1f241f0f40cc1ba5cd2e64df3b4d Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 7 Nov 2023 02:21:26 +0000 Subject: [PATCH 1/7] upgrade torch and torchvision --- setup.cfg | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.cfg b/setup.cfg index a7ce5ebb2..d158ecb54 100644 --- a/setup.cfg +++ b/setup.cfg @@ -131,13 +131,13 @@ jax_gpu = # PyTorch CPU pytorch_cpu = - torch==2.0.1 - torchvision==0.15.2 + torch==2.1.0 + torchvision=0.16.0 # PyTorch GPU pytorch_gpu = - torch==2.0.1+cu118 - torchvision==0.15.2+cu118 + torch==2.1.0+cu118 + torchvision==0.16.0+cu118 # wandb wandb = From 9cab9b77a2fbdc6dc8f9a2a4bf66fb5714080653 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 7 Nov 2023 02:23:01 +0000 Subject: [PATCH 2/7] fix typo --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d158ecb54..fb0a32eae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -132,7 +132,7 @@ jax_gpu = # PyTorch CPU pytorch_cpu = torch==2.1.0 - torchvision=0.16.0 + torchvision==0.16.0 # PyTorch GPU pytorch_gpu = From 04125519e3a0c1a699b7baa30b91ee14c6ccf331 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 7 Nov 2023 20:54:04 +0000 Subject: [PATCH 3/7] modify pytorch settings for conformer and vit --- submission_runner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/submission_runner.py b/submission_runner.py index d92732145..b4b3a2616 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -220,7 +220,7 @@ def train_once( model_params, model_state = workload.init_model_fn( model_init_rng, dropout_rate, aux_dropout_rate) if FLAGS.framework == 'pytorch' and FLAGS.torch_compile: - compile_error_workloads = ['librispeech_conformer', 'ogbg', 'criteo1tb'] + compile_error_workloads = ['librispeech_conformer', 'ogbg', 'criteo1tb', 'imagenet_vit'] eager_backend_workloads = ['librispeech_deepspeech'] aot_eager_backend_workloads = [] if FLAGS.workload in compile_error_workloads: @@ -603,7 +603,10 @@ def main(_): # Prevent OOM on librispeech conformer. if FLAGS.workload == 'librispeech_conformer': - os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85' + if FLAGS.framework = 'jax' + os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85' + elif FLAGS.framework == 'pytorch' and torch.cuda.is_available(): + torch.cuda.memory._set_allocator_settings('expandable_segments:True') if FLAGS.set_pytorch_max_split_size: os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256' From b41fe87f645f6b560d3597856bb61b69d650d74e Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 7 Nov 2023 20:56:19 +0000 Subject: [PATCH 4/7] syntax fix --- submission_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submission_runner.py b/submission_runner.py index b4b3a2616..b54513a50 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -603,7 +603,7 @@ def main(_): # Prevent OOM on librispeech conformer. if FLAGS.workload == 'librispeech_conformer': - if FLAGS.framework = 'jax' + if FLAGS.framework == 'jax': os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85' elif FLAGS.framework == 'pytorch' and torch.cuda.is_available(): torch.cuda.memory._set_allocator_settings('expandable_segments:True') From 8d99197bb479aa2cccadadd4f0def63d5c39c176 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 Nov 2023 01:18:08 +0000 Subject: [PATCH 5/7] formatting --- submission_runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/submission_runner.py b/submission_runner.py index b54513a50..7c7b1371f 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -220,7 +220,9 @@ def train_once( model_params, model_state = workload.init_model_fn( model_init_rng, dropout_rate, aux_dropout_rate) if FLAGS.framework == 'pytorch' and FLAGS.torch_compile: - compile_error_workloads = ['librispeech_conformer', 'ogbg', 'criteo1tb', 'imagenet_vit'] + compile_error_workloads = [ + 'librispeech_conformer', 'ogbg', 'criteo1tb', 'imagenet_vit' + ] eager_backend_workloads = ['librispeech_deepspeech'] aot_eager_backend_workloads = [] if FLAGS.workload in compile_error_workloads: From aae73f29e2f4c74900fe049380660fdcfe1b0c94 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 10 Nov 2023 03:36:08 +0000 Subject: [PATCH 6/7] remove mem allocation option for conformer --- submission_runner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/submission_runner.py b/submission_runner.py index 7c7b1371f..4b8d589da 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -605,10 +605,7 @@ def main(_): # Prevent OOM on librispeech conformer. if FLAGS.workload == 'librispeech_conformer': - if FLAGS.framework == 'jax': - os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85' - elif FLAGS.framework == 'pytorch' and torch.cuda.is_available(): - torch.cuda.memory._set_allocator_settings('expandable_segments:True') + os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85' if FLAGS.set_pytorch_max_split_size: os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256' From f5170c5ceffe02c306301bb7c3a65e26a7dfa3cf Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 16 Nov 2023 07:26:17 +0000 Subject: [PATCH 7/7] extract fix --- datasets/dataset_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 2f808b64b..f9ee2f138 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -388,7 +388,7 @@ def download_fastmri(data_dir, def extract(source, dest, mode='r:xz'): if not os.path.exists(dest): - os.path.makedirs(dest) + os.makedirs(dest) logging.info(f'Extracting {source} to {dest}') tar = tarfile.open(source, mode) logging.info('Opened tar')