Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9118b3e
initial commit for r2 downloader
anandhu-eng Jul 4, 2025
5165771
fixes
anandhu-eng Jul 4, 2025
dbbd5a6
add dry run for r2
anandhu-eng Jul 4, 2025
4989779
add support for r2 downloader
anandhu-eng Jul 4, 2025
7b80efd
add r2 downloader for dataset whisper
anandhu-eng Jul 7, 2025
d791f3d
[Automated Commit] Format Codebase [skip ci]
github-actions[bot] Jul 7, 2025
1557d7c
fixes for r2_downloader
anandhu-eng Jul 8, 2025
d96492d
merge changes from upstream
anandhu-eng Jul 8, 2025
59e8cd5
[Automated Commit] Format Codebase [skip ci]
github-actions[bot] Jul 8, 2025
5323939
fix rclone being included multiple times
anandhu-eng Jul 8, 2025
29e9f4c
Merge branch 'r2-downloader' of http://github.com/anandhu-eng/mlperf-…
anandhu-eng Jul 8, 2025
2e0eb20
add support for r2 download
anandhu-eng Jul 8, 2025
d2b6402
fix typo
anandhu-eng Jul 8, 2025
36c7d3a
Add tests for llama3 model
anandhu-eng Jul 8, 2025
834f341
Merge branch 'dev' into r2-downloader
arjunsuresh Jul 8, 2025
0e107db
Merge branch 'dev' into r2-downloader
anandhu-eng Jul 8, 2025
5f475f2
set needs pat to true
anandhu-eng Jul 9, 2025
328cee0
Update meta.yaml
anandhu-eng Jul 9, 2025
228dcac
Update meta.yaml
anandhu-eng Jul 9, 2025
cf5efb3
handle dry run
anandhu-eng Jul 9, 2025
8244a8f
Update meta.yaml
anandhu-eng Jul 9, 2025
f153c62
add r2 download support - cnndm,deepseekr1
anandhu-eng Jul 10, 2025
c7e56f7
Merge branch 'r2-downloader' of http://github.com/anandhu-eng/mlperf-…
anandhu-eng Jul 10, 2025
9dfe1e5
fixes
anandhu-eng Jul 10, 2025
4e2fc2e
fix rclone issue
anandhu-eng Jul 10, 2025
984db0c
skip checking for rclone dry run
anandhu-eng Jul 10, 2025
f202f1f
Merge branch 'dev' into r2-downloader
arjunsuresh Jul 11, 2025
a791711
Merge branch 'dev' into r2-downloader
arjunsuresh Jul 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions script/download-and-extract/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -125,4 +125,9 @@ variations:
download-script:
tags: _wget
group: download-tool
r2_downloader:
add_deps:
download-script:
tags: _r2_downloader
group: download-tool
versions: {}
12 changes: 12 additions & 0 deletions script/download-file/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,18 @@ def preprocess(i):
env['MLC_DOWNLOAD_CMD'] += f" || (({del_cmd} {env['MLC_DOWNLOAD_FILENAME']} || true) && wget -nc {extra_download_options} {url})"
logger.info(f"{env['MLC_DOWNLOAD_CMD']}")

elif tool == "r2_downloader":
env['MLC_DOWNLOAD_CMD'] = f"bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) "
if env["MLC_HOST_OS_TYPE"] == "windows":
# have to modify the variable from url to temp_url if it is
# going to be used anywhere after this point
url = url.replace("%", "%%")
temp_download_file = env['MLC_DOWNLOAD_FILENAME'].replace(
"%", "%%")
else:
temp_download_file = env['MLC_DOWNLOAD_FILENAME']
env['MLC_DOWNLOAD_CMD'] += f" -d {q}{os.path.join(os.getcwd(), temp_download_file)}{q} {extra_download_options} {url}"

elif tool == "curl":
if env.get('MLC_DOWNLOAD_FILENAME', '') != '':
extra_download_options += f" --output {q}{env['MLC_DOWNLOAD_FILENAME']}{q} "
Expand Down
5 changes: 5 additions & 0 deletions script/download-file/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,9 @@ variations:
env:
MLC_DOWNLOAD_TOOL: wget
group: download-tool
r2_downloader:
env:
MLC_DOWNLOAD_TOOL: r2_downloader
group: download-tool

versions: {}
33 changes: 17 additions & 16 deletions script/get-dataset-cnndm/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,23 @@ def preprocess(i):
def postprocess(i):
env = i['env']

if env.get('MLC_TMP_ML_MODEL', '') != "llama3_1-8b":
if is_false(env.get('MLC_DATASET_CALIBRATION', '')):
env['MLC_DATASET_PATH'] = os.path.join(os.getcwd(), 'install')
env['MLC_DATASET_EVAL_PATH'] = os.path.join(
os.getcwd(), 'install', 'cnn_eval.json')
env['MLC_DATASET_CNNDM_EVAL_PATH'] = os.path.join(
os.getcwd(), 'install', 'cnn_eval.json')
env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PATH']
if env.get('MLC_DOWNLOAD_MODE', '') != "dry":
if env.get('MLC_TMP_ML_MODEL', '') != "llama3_1-8b":
if is_false(env.get('MLC_DATASET_CALIBRATION', '')):
env['MLC_DATASET_PATH'] = os.path.join(os.getcwd(), 'install')
env['MLC_DATASET_EVAL_PATH'] = os.path.join(
os.getcwd(), 'install', 'cnn_eval.json')
env['MLC_DATASET_CNNDM_EVAL_PATH'] = os.path.join(
os.getcwd(), 'install', 'cnn_eval.json')
env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PATH']
else:
env['MLC_CALIBRATION_DATASET_PATH'] = os.path.join(
os.getcwd(), 'install', 'cnn_dailymail_calibration.json')
env['MLC_CALIBRATION_DATASET_CNNDM_PATH'] = os.path.join(
os.getcwd(), 'install', 'cnn_dailymail_calibration.json')
env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_CALIBRATION_DATASET_PATH']
else:
env['MLC_CALIBRATION_DATASET_PATH'] = os.path.join(
os.getcwd(), 'install', 'cnn_dailymail_calibration.json')
env['MLC_CALIBRATION_DATASET_CNNDM_PATH'] = os.path.join(
os.getcwd(), 'install', 'cnn_dailymail_calibration.json')
env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_CALIBRATION_DATASET_PATH']
else:
env['MLC_DATASET_CNNDM_EVAL_PATH'] = os.path.join(
env['MLC_DATASET_CNNDM_EVAL_PATH'], env['MLC_DATASET_CNNDM_FILENAME'])
env['MLC_DATASET_CNNDM_EVAL_PATH'] = os.path.join(
env['MLC_DATASET_CNNDM_EVAL_PATH'], env['MLC_DATASET_CNNDM_FILENAME'])

return {'return': 0}
64 changes: 54 additions & 10 deletions script/get-dataset-cnndm/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,29 +58,38 @@ variations:
datacenter:
group: category
rclone:
prehook_deps:
- tags: get,rclone
enable_if_env:
MLC_TMP_REQUIRE_DOWNLOAD:
- yes
- tags: get,rclone-config,_mlc-inference
force_cache: true
enable_if_env:
MLC_TMP_REQUIRE_DOWNLOAD:
- yes
group: download-tool
add_deps_recursive:
dae:
tags: _rclone
r2_downloader:
group: download-tool
add_deps_recursive:
dae:
tags: _r2_downloader
dry-run:
group: run-mode
env:
MLC_DOWNLOAD_MODE: dry
dry-run,rclone:
env:
MLC_DOWNLOAD_EXTRA_OPTIONS: --dry-run
dry-run,r2_downloader:
env:
MLC_DOWNLOAD_EXTRA_OPTIONS: -x
mlc:
group: download-src
prehook_deps:
- tags: get,rclone
enable_if_env:
MLC_TMP_REQUIRE_DOWNLOAD:
- yes
- tags: get,rclone-config,_mlc-inference
force_cache: true
enable_if_env:
MLC_TMP_REQUIRE_DOWNLOAD:
- yes
- enable_if_env:
MLC_TMP_REQUIRE_DOWNLOAD:
- 'yes'
Expand Down Expand Up @@ -126,6 +135,15 @@ variations:
MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/<<<MLC_DATASET_CNNDM_FILENAME>>>
MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH
MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH
validation,edge,llama3,mlc,r2_downloader:
adr:
dae:
extra_cache_tags: cnndm,dataset,llama3,val,edge
env:
MLC_DATASET_CNNDM_FILENAME: sample_cnn_eval_5000.json
MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/llama3-1-8b-sample-cnn-eval-5000.uri
MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH
MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH
validation,datacenter,llama3,mlc,rclone:
adr:
dae:
Expand All @@ -135,6 +153,15 @@ variations:
MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/<<<MLC_DATASET_CNNDM_FILENAME>>>
MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH
MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH
validation,datacenter,llama3,mlc,r2_downloader:
adr:
dae:
extra_cache_tags: cnndm,dataset,llama3,val,datacenter
env:
MLC_DATASET_CNNDM_FILENAME: cnn_eval.json
MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/llama3-1-8b-cnn-eval.uri
MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH
MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH
calibration,llama3,mlc,rclone:
adr:
dae:
Expand All @@ -144,4 +171,21 @@ variations:
MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/<<<MLC_DATASET_CNNDM_FILENAME>>>
MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_CALIBRATION_DATASET_CNNDM_PATH
MLC_EXTRACT_FINAL_ENV_NAME: MLC_CALIBRATION_DATASET_CNNDM_PATH

calibration,llama3,mlc,r2_downloader:
adr:
dae:
extra_cache_tags: cnndm,dataset,llama3,calib
env:
MLC_DATASET_CNNDM_FILENAME: cnn_dailymail_calibration.json
MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/llama3-1-8b-cnn-dailymail-calibration.uri
MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_CALIBRATION_DATASET_CNNDM_PATH
MLC_EXTRACT_FINAL_ENV_NAME: MLC_CALIBRATION_DATASET_CNNDM_PATH
tests:
run_inputs:
- variations_list:
# - validation,edge,rclone,llama3,mlc,dry-run
# - validation,datacenter,rclone,llama3,mlc,dry-run
- validation,edge,r2_downloader,llama3,mlc,dry-run
- validation,datacenter,r2_downloader,llama3,mlc,dry-run
- calibration,r2_downloader,llama3,mlc,dry-run
# - calibration,rclone,llama3,mlc,dry-run
51 changes: 26 additions & 25 deletions script/get-dataset-whisper/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,30 +42,31 @@ def postprocess(i):

env = i['env']

if env.get('MLC_TMP_DATASET_TYPE', '') != "preprocessed":
cwd = env.get('MLC_OUTDIRNAME', os.getcwd())
data_dir = os.path.join(cwd, 'data')
env['MLC_DATASET_WHISPER_PATH'] = data_dir
else:
# copy files to data folder
tmp_src_dir = env["MLC_DATASET_WHISPER_PATH"]
tmp_dest_dir = os.path.join(tmp_src_dir, "data")

os.makedirs(tmp_dest_dir, exist_ok=True)

items_to_copy = [
"LibriSpeech",
"dev-all",
"dev-all-repack",
"dev-all-repack.json"
]

for item in items_to_copy:
src_path = os.path.join(tmp_src_dir, item)
dst_path = os.path.join(tmp_dest_dir, item)
if os.path.isdir(src_path):
shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
elif os.path.isfile(src_path):
shutil.copy2(src_path, dst_path)
if env.get('MLC_DOWNLOAD_MODE', '') != "dry":
if env.get('MLC_TMP_DATASET_TYPE', '') != "preprocessed":
cwd = env.get('MLC_OUTDIRNAME', os.getcwd())
data_dir = os.path.join(cwd, 'data')
env['MLC_DATASET_WHISPER_PATH'] = data_dir
else:
# copy files to data folder
tmp_src_dir = env["MLC_DATASET_WHISPER_PATH"]
tmp_dest_dir = os.path.join(tmp_src_dir, "data")

os.makedirs(tmp_dest_dir, exist_ok=True)

items_to_copy = [
"LibriSpeech",
"dev-all",
"dev-all-repack",
"dev-all-repack.json"
]

for item in items_to_copy:
src_path = os.path.join(tmp_src_dir, item)
dst_path = os.path.join(tmp_dest_dir, item)
if os.path.isdir(src_path):
shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
elif os.path.isfile(src_path):
shutil.copy2(src_path, dst_path)

return {'return': 0}
38 changes: 27 additions & 11 deletions script/get-dataset-whisper/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,15 @@ tags:
- dataset
- whisper
uid: 2cc955c795d44978
tests:
run_inputs:
- variations_list:
- rclone,preprocessed,mlc,dry-run
- r2_downloader,preprocessed,mlc,dry-run
variations:
preprocessed:
group: dataset-type
default: true
base:
- mlc
env:
MLC_TMP_DATASET_TYPE: preprocessed
unprocessed:
Expand All @@ -42,13 +45,28 @@ variations:
dry-run,rclone:
env:
MLC_DOWNLOAD_EXTRA_OPTIONS: --dry-run
dry-run,r2_downloader:
env:
MLC_DOWNLOAD_EXTRA_OPTIONS: -x
mlc:
default: true
base:
- rclone
env:
MLC_DOWNLOAD_SRC: mlcommons
group: download-src
rclone:
add_deps_recursive:
dae:
tags: _rclone
default: true
group: download-tool
r2_downloader:
add_deps_recursive:
dae:
tags: _r2_downloader
group: download-tool
rclone,preprocessed:
env:
MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/Whisper/dataset/
prehook_deps:
- enable_if_env:
MLC_TMP_REQUIRE_DOWNLOAD:
Expand All @@ -59,12 +77,13 @@ variations:
- true
force_cache: true
tags: get,rclone-config,_mlc-inference
mlc,preprocessed:
prehook_deps:
- enable_if_env:
MLC_TMP_REQUIRE_DOWNLOAD:
- 'yes'
env:
MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_WHISPER_PATH
MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/Whisper/dataset/
MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_WHISPER_PATH
extra_cache_tags: whisper,dataset
force_cache: true
Expand All @@ -76,9 +95,6 @@ variations:
update_tags_from_env_with_prefix:
_url.:
- MLC_DOWNLOAD_URL
rclone:
add_deps_recursive:
dae:
tags: _rclone
default: true
group: download-tool
r2_downloader,preprocessed:
env:
MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/whisper-dataset.uri
Loading
Loading