From 04a83802d620c6237a257f5dea5f6781c8aa6d2a Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 13 Sep 2023 00:12:48 +0000 Subject: [PATCH 01/10] fix arg for deletion prompt --- datasets/dataset_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 8fc442b76..77885d660 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -298,7 +298,7 @@ def download_criteo1tb(data_dir, logging.info(f'Running Criteo 1TB unzip command:\n{unzip_cmd}') p = subprocess.Popen(unzip_cmd, shell=True) p.communicate() - _maybe_prompt_for_deletion(all_days_zip_filepath, interactive_deletion) + _maybe_prompt_for_deletion([all_days_zip_filepath], interactive_deletion) # Unzip the individual days. processes = [] From 1a3679d3f25e1289dcc3615a43fbf68a58959c5c Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 13 Sep 2023 17:29:49 +0000 Subject: [PATCH 02/10] move delete prompt to end of criteo download --- datasets/dataset_setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 77885d660..dfcecbd85 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -316,9 +316,9 @@ def download_criteo1tb(data_dir, _maybe_prompt_for_deletion(gz_paths, interactive_deletion) # Split into files with 5M lines each: day_1.csv -> day_1_[0-39].csv. + unzipped_paths = [] for batch in range(6): batch_processes = [] - unzipped_paths = [] for day_offset in range(4): day = batch * 4 + day_offset unzipped_path = os.path.join(criteo_dir, f'day_{day}.csv') @@ -330,7 +330,7 @@ def download_criteo1tb(data_dir, batch_processes.append(subprocess.Popen(split_cmd, shell=True)) for p in batch_processes: p.communicate() - _maybe_prompt_for_deletion(unzipped_paths, interactive_deletion) + _maybe_prompt_for_deletion(unzipped_paths, interactive_deletion) def download_cifar(data_dir, framework): From efdd670c336b74c4a06c59f47d4fa27013a045e1 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 14 Sep 2023 19:53:56 +0000 Subject: [PATCH 03/10] librispeech processing --- datasets/dataset_setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index dfcecbd85..8099fe7cc 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -584,7 +584,7 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz') subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + f'tar xzvf {tar_path} --directory {extracted_data_dir}', shell=True).communicate() tars = [ @@ -599,7 +599,7 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) subprocess.Popen( - f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', + f'tar xzvf {tar_path} --directory {extracted_data_dir}', shell=True).communicate() tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab') From 26713bcbed056c9c6232c5c998a1aa5b186856da Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 14 Sep 2023 20:27:12 +0000 Subject: [PATCH 04/10] fix --- datasets/dataset_setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 8099fe7cc..71efa3434 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -575,6 +575,8 @@ def download_librispeech(dataset_dir, tmp_dir): final_data_dir = os.path.join(dataset_dir, 'librispeech') _maybe_mkdir(tmp_librispeech_dir) + _maybe_mkdir(extracted_data_dir) + _maybe_mkdir(final_data_dir) for split in ['dev', 'test']: for version in ['clean', 'other']: From f3881daf7775f268f86892163e49f0d42986dc11 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 14 Sep 2023 20:49:49 +0000 Subject: [PATCH 05/10] librispeech fix --- datasets/dataset_setup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index 71efa3434..b6ae48378 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -571,11 +571,10 @@ def download_librispeech(dataset_dir, tmp_dir): # extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') # final_data_dir = os.path.join(dataset_dir, 'librispeech_processed') tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech_raw') - extracted_data_dir = os.path.join(tmp_dir, 'librispeech_extracted') + extracted_data_dir = os.path.join(tmp_dir, 'LibriSpeech) final_data_dir = os.path.join(dataset_dir, 'librispeech') _maybe_mkdir(tmp_librispeech_dir) - _maybe_mkdir(extracted_data_dir) _maybe_mkdir(final_data_dir) for split in ['dev', 'test']: @@ -586,7 +585,7 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz') subprocess.Popen( - f'tar xzvf {tar_path} --directory {extracted_data_dir}', + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() tars = [ @@ -601,7 +600,7 @@ def download_librispeech(dataset_dir, tmp_dir): subprocess.Popen(wget_cmd, shell=True).communicate() tar_path = os.path.join(tmp_librispeech_dir, tar_filename) subprocess.Popen( - f'tar xzvf {tar_path} --directory {extracted_data_dir}', + f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}', shell=True).communicate() tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab') From fd710ab40df007640072cc953d5274f27b4057b1 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 14 Sep 2023 20:52:00 +0000 Subject: [PATCH 06/10] syntax fix --- datasets/dataset_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index b6ae48378..df3ba22fe 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -571,7 +571,7 @@ def download_librispeech(dataset_dir, tmp_dir): # extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') # final_data_dir = os.path.join(dataset_dir, 'librispeech_processed') tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech_raw') - extracted_data_dir = os.path.join(tmp_dir, 'LibriSpeech) + extracted_data_dir = os.path.join(tmp_dir, 'LibriSpeech') final_data_dir = os.path.join(dataset_dir, 'librispeech') _maybe_mkdir(tmp_librispeech_dir) From fa0862601c72fa657662562366e59243807895a6 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 14 Sep 2023 23:25:33 +0000 Subject: [PATCH 07/10] fix --- datasets/dataset_setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index df3ba22fe..fe5e2a9a0 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -570,8 +570,8 @@ def download_librispeech(dataset_dir, tmp_dir): # tmp_librispeech_dir = os.path.join(dataset_dir, 'librispeech') # extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') # final_data_dir = os.path.join(dataset_dir, 'librispeech_processed') - tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech_raw') - extracted_data_dir = os.path.join(tmp_dir, 'LibriSpeech') + tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech') + extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') final_data_dir = os.path.join(dataset_dir, 'librispeech') _maybe_mkdir(tmp_librispeech_dir) From e9119b9f5084827820a631c3dfd57aa73f427c7a Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 14 Sep 2023 23:26:20 +0000 Subject: [PATCH 08/10] documentation --- datasets/dataset_setup.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py index fe5e2a9a0..e7f8c1d13 100644 --- a/datasets/dataset_setup.py +++ b/datasets/dataset_setup.py @@ -567,9 +567,6 @@ def download_librispeech(dataset_dir, tmp_dir): # After extraction the result is a folder named Librispeech containing audio # files in .flac format along with transcripts containing name of audio file # and corresponding transcription. - # tmp_librispeech_dir = os.path.join(dataset_dir, 'librispeech') - # extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') - # final_data_dir = os.path.join(dataset_dir, 'librispeech_processed') tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech') extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech') final_data_dir = os.path.join(dataset_dir, 'librispeech') From ae9d46f0e7fea1fe77b41d8a0ce94099f121bf5b Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 15 Sep 2023 00:29:56 +0000 Subject: [PATCH 09/10] typo fix --- datasets/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/README.md b/datasets/README.md index 93d7d4b9e..5ff0e18a7 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -100,7 +100,7 @@ python3 datasets/dataset_setup.py \ --imagenet \ --temp_dir $DATA_DIR/tmp \ --imagenet_train_url \ ---imagenet_val_url \ --framework jax ``` From 241e546dc1b737e066a054cac68b41a43c8da921 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Fri, 15 Sep 2023 17:53:10 +0000 Subject: [PATCH 10/10] add test-other counts to librispeech preprocessing --- datasets/librispeech_preprocess.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datasets/librispeech_preprocess.py b/datasets/librispeech_preprocess.py index 0968f2a00..acdaa8e98 100644 --- a/datasets/librispeech_preprocess.py +++ b/datasets/librispeech_preprocess.py @@ -32,6 +32,7 @@ 'train-clean-360': 104014, 'train-other-500': 148688, 'test-clean': 2620, + 'test-other': 2939, 'dev-clean': 2703, 'dev-other': 2864, }