From 04a83802d620c6237a257f5dea5f6781c8aa6d2a Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Wed, 13 Sep 2023 00:12:48 +0000
Subject: [PATCH 01/10] fix arg for deletion prompt

---
 datasets/dataset_setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py
index 8fc442b76..77885d660 100644
--- a/datasets/dataset_setup.py
+++ b/datasets/dataset_setup.py
@@ -298,7 +298,7 @@ def download_criteo1tb(data_dir,
   logging.info(f'Running Criteo 1TB unzip command:\n{unzip_cmd}')
   p = subprocess.Popen(unzip_cmd, shell=True)
   p.communicate()
-  _maybe_prompt_for_deletion(all_days_zip_filepath, interactive_deletion)
+  _maybe_prompt_for_deletion([all_days_zip_filepath], interactive_deletion)
 
   # Unzip the individual days.
   processes = []

From 1a3679d3f25e1289dcc3615a43fbf68a58959c5c Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Wed, 13 Sep 2023 17:29:49 +0000
Subject: [PATCH 02/10] move delete prompt to end of criteo download

---
 datasets/dataset_setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py
index 77885d660..dfcecbd85 100644
--- a/datasets/dataset_setup.py
+++ b/datasets/dataset_setup.py
@@ -316,9 +316,9 @@ def download_criteo1tb(data_dir,
   _maybe_prompt_for_deletion(gz_paths, interactive_deletion)
 
   # Split into files with 5M lines each: day_1.csv -> day_1_[0-39].csv.
+  unzipped_paths = []
   for batch in range(6):
     batch_processes = []
-    unzipped_paths = []
     for day_offset in range(4):
       day = batch * 4 + day_offset
       unzipped_path = os.path.join(criteo_dir, f'day_{day}.csv')
@@ -330,7 +330,7 @@ def download_criteo1tb(data_dir,
       batch_processes.append(subprocess.Popen(split_cmd, shell=True))
     for p in batch_processes:
       p.communicate()
-    _maybe_prompt_for_deletion(unzipped_paths, interactive_deletion)
+  _maybe_prompt_for_deletion(unzipped_paths, interactive_deletion)
 
 
 def download_cifar(data_dir, framework):

From efdd670c336b74c4a06c59f47d4fa27013a045e1 Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Thu, 14 Sep 2023 19:53:56 +0000
Subject: [PATCH 03/10] librispeech processing

---
 datasets/dataset_setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py
index dfcecbd85..8099fe7cc 100644
--- a/datasets/dataset_setup.py
+++ b/datasets/dataset_setup.py
@@ -584,7 +584,7 @@ def download_librispeech(dataset_dir, tmp_dir):
       subprocess.Popen(wget_cmd, shell=True).communicate()
       tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz')
       subprocess.Popen(
-          f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}',
+          f'tar xzvf {tar_path} --directory {extracted_data_dir}',
           shell=True).communicate()
 
   tars = [
@@ -599,7 +599,7 @@ def download_librispeech(dataset_dir, tmp_dir):
     subprocess.Popen(wget_cmd, shell=True).communicate()
     tar_path = os.path.join(tmp_librispeech_dir, tar_filename)
     subprocess.Popen(
-        f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}',
+        f'tar xzvf {tar_path} --directory {extracted_data_dir}',
         shell=True).communicate()
 
   tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab')

From 26713bcbed056c9c6232c5c998a1aa5b186856da Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Thu, 14 Sep 2023 20:27:12 +0000
Subject: [PATCH 04/10] create librispeech extracted and final data dirs

---
 datasets/dataset_setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py
index 8099fe7cc..71efa3434 100644
--- a/datasets/dataset_setup.py
+++ b/datasets/dataset_setup.py
@@ -575,6 +575,8 @@ def download_librispeech(dataset_dir, tmp_dir):
   final_data_dir = os.path.join(dataset_dir, 'librispeech')
 
   _maybe_mkdir(tmp_librispeech_dir)
+  _maybe_mkdir(extracted_data_dir)
+  _maybe_mkdir(final_data_dir)
 
   for split in ['dev', 'test']:
     for version in ['clean', 'other']:

From f3881daf7775f268f86892163e49f0d42986dc11 Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Thu, 14 Sep 2023 20:49:49 +0000
Subject: [PATCH 05/10] librispeech fix

---
 datasets/dataset_setup.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py
index 71efa3434..b6ae48378 100644
--- a/datasets/dataset_setup.py
+++ b/datasets/dataset_setup.py
@@ -571,11 +571,10 @@ def download_librispeech(dataset_dir, tmp_dir):
   # extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech')
   # final_data_dir = os.path.join(dataset_dir, 'librispeech_processed')
   tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech_raw')
-  extracted_data_dir = os.path.join(tmp_dir, 'librispeech_extracted')
+  extracted_data_dir = os.path.join(tmp_dir, 'LibriSpeech)
   final_data_dir = os.path.join(dataset_dir, 'librispeech')
 
   _maybe_mkdir(tmp_librispeech_dir)
-  _maybe_mkdir(extracted_data_dir)
   _maybe_mkdir(final_data_dir)
 
   for split in ['dev', 'test']:
@@ -586,7 +585,7 @@ def download_librispeech(dataset_dir, tmp_dir):
       subprocess.Popen(wget_cmd, shell=True).communicate()
       tar_path = os.path.join(tmp_librispeech_dir, f'{split}-{version}.tar.gz')
       subprocess.Popen(
-          f'tar xzvf {tar_path} --directory {extracted_data_dir}',
+          f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}',
           shell=True).communicate()
 
   tars = [
@@ -601,7 +600,7 @@ def download_librispeech(dataset_dir, tmp_dir):
     subprocess.Popen(wget_cmd, shell=True).communicate()
     tar_path = os.path.join(tmp_librispeech_dir, tar_filename)
     subprocess.Popen(
-        f'tar xzvf {tar_path} --directory {extracted_data_dir}',
+        f'tar xzvf {tar_path} --directory {tmp_librispeech_dir}',
         shell=True).communicate()
 
   tokenizer_vocab_path = os.path.join(extracted_data_dir, 'spm_model.vocab')

From fd710ab40df007640072cc953d5274f27b4057b1 Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Thu, 14 Sep 2023 20:52:00 +0000
Subject: [PATCH 06/10] syntax fix: close quote in extracted_data_dir path

---
 datasets/dataset_setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py
index b6ae48378..df3ba22fe 100644
--- a/datasets/dataset_setup.py
+++ b/datasets/dataset_setup.py
@@ -571,7 +571,7 @@ def download_librispeech(dataset_dir, tmp_dir):
   # extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech')
   # final_data_dir = os.path.join(dataset_dir, 'librispeech_processed')
   tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech_raw')
-  extracted_data_dir = os.path.join(tmp_dir, 'LibriSpeech)
+  extracted_data_dir = os.path.join(tmp_dir, 'LibriSpeech')
   final_data_dir = os.path.join(dataset_dir, 'librispeech')
 
   _maybe_mkdir(tmp_librispeech_dir)

From fa0862601c72fa657662562366e59243807895a6 Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Thu, 14 Sep 2023 23:25:33 +0000
Subject: [PATCH 07/10] fix librispeech tmp paths: nest LibriSpeech under tmp librispeech dir

---
 datasets/dataset_setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py
index df3ba22fe..fe5e2a9a0 100644
--- a/datasets/dataset_setup.py
+++ b/datasets/dataset_setup.py
@@ -570,8 +570,8 @@ def download_librispeech(dataset_dir, tmp_dir):
   # tmp_librispeech_dir = os.path.join(dataset_dir, 'librispeech')
   # extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech')
   # final_data_dir = os.path.join(dataset_dir, 'librispeech_processed')
-  tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech_raw')
-  extracted_data_dir = os.path.join(tmp_dir, 'LibriSpeech')
+  tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech')
+  extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech')
   final_data_dir = os.path.join(dataset_dir, 'librispeech')
 
   _maybe_mkdir(tmp_librispeech_dir)

From e9119b9f5084827820a631c3dfd57aa73f427c7a Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Thu, 14 Sep 2023 23:26:20 +0000
Subject: [PATCH 08/10] remove commented-out librispeech path definitions

---
 datasets/dataset_setup.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/datasets/dataset_setup.py b/datasets/dataset_setup.py
index fe5e2a9a0..e7f8c1d13 100644
--- a/datasets/dataset_setup.py
+++ b/datasets/dataset_setup.py
@@ -567,9 +567,6 @@ def download_librispeech(dataset_dir, tmp_dir):
   # After extraction the result is a folder named Librispeech containing audio
   # files in .flac format along with transcripts containing name of audio file
   # and corresponding transcription.
-  # tmp_librispeech_dir = os.path.join(dataset_dir, 'librispeech')
-  # extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech')
-  # final_data_dir = os.path.join(dataset_dir, 'librispeech_processed')
   tmp_librispeech_dir = os.path.join(tmp_dir, 'librispeech')
   extracted_data_dir = os.path.join(tmp_librispeech_dir, 'LibriSpeech')
   final_data_dir = os.path.join(dataset_dir, 'librispeech')

From ae9d46f0e7fea1fe77b41d8a0ce94099f121bf5b Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Fri, 15 Sep 2023 00:29:56 +0000
Subject: [PATCH 09/10] typo fix

---
 datasets/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/README.md b/datasets/README.md
index 93d7d4b9e..5ff0e18a7 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -100,7 +100,7 @@ python3 datasets/dataset_setup.py \
 --imagenet \
 --temp_dir $DATA_DIR/tmp \  
 --imagenet_train_url <imagenet_train_url> \
---imagenet_val_url <imagenet_val_url\
+--imagenet_val_url <imagenet_val_url> \
 --framework jax
 
 ```

From 241e546dc1b737e066a054cac68b41a43c8da921 Mon Sep 17 00:00:00 2001
From: Priya Kasimbeg <kasimbeg@google.com>
Date: Fri, 15 Sep 2023 17:53:10 +0000
Subject: [PATCH 10/10] add test-other counts to librispeech preprocessing

---
 datasets/librispeech_preprocess.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/datasets/librispeech_preprocess.py b/datasets/librispeech_preprocess.py
index 0968f2a00..acdaa8e98 100644
--- a/datasets/librispeech_preprocess.py
+++ b/datasets/librispeech_preprocess.py
@@ -32,6 +32,7 @@
     'train-clean-360': 104014,
     'train-other-500': 148688,
     'test-clean': 2620,
+    'test-other': 2939,
     'dev-clean': 2703,
     'dev-other': 2864,
 }