From 9118b3e95bcfcf3ffd7cad57b1145a80d844fcc1 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Fri, 4 Jul 2025 08:51:54 +0530 Subject: [PATCH 01/21] initial commit for r2 downloader --- script/download-file/customize.py | 13 +++++++++++ script/download-file/meta.yaml | 5 ++++ script/get-ml-model-whisper/meta.yaml | 33 +++++++++++++++++---------- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index 5c70f7931..14c047c23 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -203,6 +203,19 @@ def preprocess(i): env['MLC_DOWNLOAD_CMD'] += f" || (({del_cmd} {env['MLC_DOWNLOAD_FILENAME']} || true) && wget -nc {extra_download_options} {url})" logger.info(f"{env['MLC_DOWNLOAD_CMD']}") + elif tool == "r2_downloader": + env['MLC_DOWNLOAD_CMD'] = f"bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) " + env['MLC_DOWNLOAD_CMD'] += f" {url} {env['MLC_DOWNLOAD_FILENAME']}" + if env["MLC_HOST_OS_TYPE"] == "windows": + # have to modify the variable from url to temp_url if it is + # going to be used anywhere after this point + url = url.replace("%", "%%") + temp_download_file = env['MLC_DOWNLOAD_FILENAME'].replace( + "%", "%%") + else: + temp_download_file = env['MLC_DOWNLOAD_FILENAME'] + env['MLC_DOWNLOAD_CMD'] += f" -d {q}{os.path.join(os.getcwd(), temp_download_file)}{q} {extra_download_options}" + elif tool == "curl": if env.get('MLC_DOWNLOAD_FILENAME', '') != '': extra_download_options += f" --output {q}{env['MLC_DOWNLOAD_FILENAME']}{q} " diff --git a/script/download-file/meta.yaml b/script/download-file/meta.yaml index 92d11e430..e195b078f 100644 --- a/script/download-file/meta.yaml +++ b/script/download-file/meta.yaml @@ -77,4 +77,9 @@ variations: env: MLC_DOWNLOAD_TOOL: wget group: download-tool + r2_downloader: + env: + MLC_DOWNLOAD_TOOL: r2_downloader + group: download-tool + versions: {} diff --git a/script/get-ml-model-whisper/meta.yaml b/script/get-ml-model-whisper/meta.yaml index 5442b7e8d..935088624 100644 --- a/script/get-ml-model-whisper/meta.yaml +++ b/script/get-ml-model-whisper/meta.yaml @@ -31,24 +31,12 @@ variations: MLC_DOWNLOAD_SRC: mlcommons group: download-src prehook_deps: - - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - true - tags: get,rclone - - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - true - env: - MLC_RCLONE_DRIVE_FOLDER_ID: 17CpM5eU8tjrxh_LpH_BTNTeT37PhzcnC - force_cache: true - tags: get,rclone-config,_mlc-inference - enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: - 'yes' env: MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_ML_MODEL_WHISPER_PATH MLC_EXTRACT_FINAL_ENV_NAME: MLC_ML_MODEL_WHISPER_PATH - MLC_DOWNLOAD_URL: 'mlc-inference:mlcommons-inference-wg-public/Whisper/model/' extra_cache_tags: ml,model,whisper force_cache: true force_env_keys: @@ -63,5 +51,26 @@ variations: add_deps_recursive: dae: tags: _rclone + env: + MLC_DOWNLOAD_URL: 'mlc-inference:mlcommons-inference-wg-public/Whisper/model/' + prehook_deps: + - enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - true + tags: get,rclone + - enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - true + env: + MLC_RCLONE_DRIVE_FOLDER_ID: 17CpM5eU8tjrxh_LpH_BTNTeT37PhzcnC + force_cache: true + tags: get,rclone-config,_mlc-inference default: true group: download-tool + r2_downloader: + add_deps_recursive: + dae: + tags: _r2_downloader + env: + MLC_DOWNLOAD_URL: 'https://inference.mlcommons-storage.org/metadata/whisper-model.uri' + group: download-tool \ No newline at end of file From 5165771ce4c2a3fc116e8147b570f2c062a13d81 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Fri, 4 Jul 2025 08:55:49 +0530 Subject: [PATCH 02/21] fixes --- script/download-and-extract/meta.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/script/download-and-extract/meta.yaml b/script/download-and-extract/meta.yaml index 96ff8c6a8..02aaf1175 100644 --- a/script/download-and-extract/meta.yaml +++ b/script/download-and-extract/meta.yaml @@ -125,4 +125,9 @@ variations: download-script: tags: _wget group: download-tool + r2_downloader: + add_deps: + download-script: + tags: _r2_downloader + group: download-tool versions: {} From dbbd5a6700284a167d4479d80d7eb3a0972ca3f2 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Fri, 4 Jul 2025 08:58:28 +0530 Subject: [PATCH 03/21] add dry run for r2 --- script/get-ml-model-whisper/meta.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/script/get-ml-model-whisper/meta.yaml b/script/get-ml-model-whisper/meta.yaml index 935088624..7a4d5a480 100644 --- a/script/get-ml-model-whisper/meta.yaml +++ b/script/get-ml-model-whisper/meta.yaml @@ -25,6 +25,9 @@ variations: dry-run,rclone: env: MLC_DOWNLOAD_EXTRA_OPTIONS: --dry-run + dry-run,r2_downloader: + env: + MLC_DOWNLOAD_EXTRA_OPTIONS: -t mlc: default: true env: From 4989779efa68c18349cc2b768be71dae2948f290 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Fri, 4 Jul 2025 13:14:43 +0530 Subject: [PATCH 04/21] add support for r2 downloader --- .../meta.yaml | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml b/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml index 4cb2ff28d..3adcb0fd9 100644 --- a/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml +++ b/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml @@ -25,25 +25,26 @@ variations: group: dataset-type env: MLC_PREPROCESSED_DATASET_TYPE: validation + validation,rclone: + env: MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl + validation,r2_downloader: + env: + MLC_DOWNLOAD_URL: "" calibration: group: dataset-type env: MLC_PREPROCESSED_DATASET_TYPE: calibration + calibration,rclone: + env: MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_calibration_dataset_500_fp8_eval.pkl + calibration,r2_downloader: + env: + MLC_DOWNLOAD_URL: "" mlc: group: download-src default: true prehook_deps: - - tags: get,rclone - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - yes - - tags: get,rclone-config,_mlc-inference - force_cache: true - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - yes - enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: - 'yes' @@ -62,11 +63,26 @@ variations: - MLC_DOWNLOAD_URL env: MLC_DOWNLOAD_SRC: mlcommons + r2_downloader: + group: download-tool + add_deps_recursive: + dae: + tags: _r2_downloader rclone: group: download-tool add_deps_recursive: dae: tags: _rclone + prehook_deps: + - tags: get,rclone + enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - yes + - tags: get,rclone-config,_mlc-inference + force_cache: true + enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - yes default: true dry-run: group: run-mode From 7b80efd4698b2754bdd6d3c7e6ecca67d263909d Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Mon, 7 Jul 2025 11:58:02 +0530 Subject: [PATCH 05/21] add r2 downloader for dataset whisper --- script/download-file/customize.py | 2 +- script/get-dataset-whisper/meta.yaml | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index 14c047c23..35ba1a49c 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -205,7 +205,7 @@ def preprocess(i): elif tool == "r2_downloader": env['MLC_DOWNLOAD_CMD'] = f"bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) " - env['MLC_DOWNLOAD_CMD'] += f" {url} {env['MLC_DOWNLOAD_FILENAME']}" + env['MLC_DOWNLOAD_CMD'] += f" {url} " if env["MLC_HOST_OS_TYPE"] == "windows": # have to modify the variable from url to temp_url if it is # going to be used anywhere after this point diff --git a/script/get-dataset-whisper/meta.yaml b/script/get-dataset-whisper/meta.yaml index 6b433c658..6bfeee4af 100644 --- a/script/get-dataset-whisper/meta.yaml +++ b/script/get-dataset-whisper/meta.yaml @@ -49,16 +49,8 @@ variations: env: MLC_DOWNLOAD_SRC: mlcommons group: download-src + mlc,rclone,preprocessed: prehook_deps: - - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - true - tags: get,rclone - - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - true - force_cache: true - tags: get,rclone-config,_mlc-inference - enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: - 'yes' @@ -76,6 +68,17 @@ variations: update_tags_from_env_with_prefix: _url.: - MLC_DOWNLOAD_URL + rclone,preprocessed: + prehook_deps: + - enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - true + tags: get,rclone + - enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - true + force_cache: true + tags: get,rclone-config,_mlc-inference rclone: add_deps_recursive: dae: From d791f3dcd271198c9f886fb4c03f4b881a1f09ba Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 7 Jul 2025 06:28:22 +0000 Subject: [PATCH 06/21] [Automated Commit] Format Codebase [skip ci] --- script/download-file/customize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index 35ba1a49c..6dba21096 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -215,7 +215,7 @@ def preprocess(i): else: temp_download_file = env['MLC_DOWNLOAD_FILENAME'] env['MLC_DOWNLOAD_CMD'] += f" -d {q}{os.path.join(os.getcwd(), temp_download_file)}{q} {extra_download_options}" - + elif tool == "curl": if env.get('MLC_DOWNLOAD_FILENAME', '') != '': extra_download_options += f" --output {q}{env['MLC_DOWNLOAD_FILENAME']}{q} " From 1557d7c250821adc506b75a2440a0805e2d8ba90 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 8 Jul 2025 08:27:51 +0530 Subject: [PATCH 07/21] fixes for r2_downloader --- script/download-file/customize.py | 3 +-- script/get-dataset-whisper/meta.yaml | 18 +++++++++++++++--- script/get-ml-model-whisper/meta.yaml | 3 ++- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index 35ba1a49c..19e729839 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -205,7 +205,6 @@ def preprocess(i): elif tool == "r2_downloader": env['MLC_DOWNLOAD_CMD'] = f"bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) " - env['MLC_DOWNLOAD_CMD'] += f" {url} " if env["MLC_HOST_OS_TYPE"] == "windows": # have to modify the variable from url to temp_url if it is # going to be used anywhere after this point @@ -214,7 +213,7 @@ def preprocess(i): "%", "%%") else: temp_download_file = env['MLC_DOWNLOAD_FILENAME'] - env['MLC_DOWNLOAD_CMD'] += f" -d {q}{os.path.join(os.getcwd(), temp_download_file)}{q} {extra_download_options}" + env['MLC_DOWNLOAD_CMD'] += f" -d {q}{os.path.join(os.getcwd(), temp_download_file)}{q} {extra_download_options} {url}" elif tool == "curl": if env.get('MLC_DOWNLOAD_FILENAME', '') != '': diff --git a/script/get-dataset-whisper/meta.yaml b/script/get-dataset-whisper/meta.yaml index 6bfeee4af..74899732a 100644 --- a/script/get-dataset-whisper/meta.yaml +++ b/script/get-dataset-whisper/meta.yaml @@ -42,6 +42,9 @@ variations: dry-run,rclone: env: MLC_DOWNLOAD_EXTRA_OPTIONS: --dry-run + dry-run,r2_downloader: + env: + MLC_DOWNLOAD_EXTRA_OPTIONS: -x mlc: default: true base: @@ -49,14 +52,13 @@ variations: env: MLC_DOWNLOAD_SRC: mlcommons group: download-src - mlc,rclone,preprocessed: + mlc,preprocessed: prehook_deps: - enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: - 'yes' env: - MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_WHISPER_PATH - MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/Whisper/dataset/ + MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_WHISPER_PAT MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_WHISPER_PATH extra_cache_tags: whisper,dataset force_cache: true @@ -69,6 +71,8 @@ variations: _url.: - MLC_DOWNLOAD_URL rclone,preprocessed: + env: + MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/Whisper/dataset/ prehook_deps: - enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: @@ -79,9 +83,17 @@ variations: - true force_cache: true tags: get,rclone-config,_mlc-inference + r2_downloader,preprocessed: + env: + MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/whisper-dataset.uri rclone: add_deps_recursive: dae: tags: _rclone default: true group: download-tool + r2_downloader: + add_deps_recursive: + dae: + tags: _r2_downloader + group: download-tool diff --git a/script/get-ml-model-whisper/meta.yaml b/script/get-ml-model-whisper/meta.yaml index 7a4d5a480..bd7f4eaef 100644 --- a/script/get-ml-model-whisper/meta.yaml +++ b/script/get-ml-model-whisper/meta.yaml @@ -16,6 +16,7 @@ tests: run_inputs: - variations_list: - rclone,mlc,dry-run + - r2_downloader,mlc,dry-run uid: 3bea2356e97f47b1 variations: dry-run: @@ -27,7 +28,7 @@ variations: MLC_DOWNLOAD_EXTRA_OPTIONS: --dry-run dry-run,r2_downloader: env: - MLC_DOWNLOAD_EXTRA_OPTIONS: -t + MLC_DOWNLOAD_EXTRA_OPTIONS: -x mlc: default: true env: From 59e8cd5fa7e2b7d739b5f8875822dba1921ea1db Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 8 Jul 2025 02:59:53 +0000 Subject: [PATCH 08/21] [Automated Commit] Format Codebase [skip ci] --- script/download-file/customize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index 19e729839..3e0665c79 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -214,7 +214,7 @@ def preprocess(i): else: temp_download_file = env['MLC_DOWNLOAD_FILENAME'] env['MLC_DOWNLOAD_CMD'] += f" -d {q}{os.path.join(os.getcwd(), temp_download_file)}{q} {extra_download_options} {url}" - + elif tool == "curl": if env.get('MLC_DOWNLOAD_FILENAME', '') != '': extra_download_options += f" --output {q}{env['MLC_DOWNLOAD_FILENAME']}{q} " From 5323939fa0760b1b55dce15aef93bee7f9f53787 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 8 Jul 2025 09:10:42 +0530 Subject: [PATCH 09/21] fix rclone being included multiple times --- script/get-dataset-whisper/meta.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/script/get-dataset-whisper/meta.yaml b/script/get-dataset-whisper/meta.yaml index 74899732a..1652c2e81 100644 --- a/script/get-dataset-whisper/meta.yaml +++ b/script/get-dataset-whisper/meta.yaml @@ -14,8 +14,6 @@ variations: preprocessed: group: dataset-type default: true - base: - - mlc env: MLC_TMP_DATASET_TYPE: preprocessed unprocessed: @@ -47,8 +45,6 @@ variations: MLC_DOWNLOAD_EXTRA_OPTIONS: -x mlc: default: true - base: - - rclone env: MLC_DOWNLOAD_SRC: mlcommons group: download-src From 2e0eb207f65729ae74b9ace69748eea4f6b6637d Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 8 Jul 2025 10:07:57 +0530 Subject: [PATCH 10/21] add support for r2 download --- script/get-ml-model-llama3/meta.yaml | 45 +++++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/script/get-ml-model-llama3/meta.yaml b/script/get-ml-model-llama3/meta.yaml index 4752cf5d3..db7274afb 100644 --- a/script/get-ml-model-llama3/meta.yaml +++ b/script/get-ml-model-llama3/meta.yaml @@ -46,31 +46,22 @@ variations: default: true env: MLC_ML_MODEL_NAME: Llama-3.1-405B-Instruct + MLC_ML_MODEL_R2_HOSTED_NAME: llama3-1-405b-instruct 8b: group: model-size env: MLC_ML_MODEL_NAME: Llama-3.1-8b-Instruct + MLC_ML_MODEL_R2_HOSTED_NAME: llama3-1-8b-instruct mlc: group: download-src default: true prehook_deps: - - tags: get,rclone - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - yes - - tags: get,rclone-config,_mlperf-llama3-1 - force_cache: true - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - yes - enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: - 'yes' env: MLC_DOWNLOAD_FINAL_ENV_NAME: LLAMA3_CHECKPOINT_PATH MLC_EXTRACT_FINAL_ENV_NAME: LLAMA3_CHECKPOINT_PATH - MLC_DOWNLOAD_URL: mlc-llama3-1:inference/<<>> - extra_cache_tags: llama3,dataset force_cache: true names: - dae @@ -82,12 +73,39 @@ variations: - MLC_DOWNLOAD_URL env: MLC_DOWNLOAD_SRC: mlcommons + mlc,rclone: + env: + MLC_DOWNLOAD_URL: mlc-llama3-1:inference/<<>> + adr: + dae: + extra_cache_tags: llama3,dataset,rclone + mlc,r2_downloader: + env: + MLC_DOWNLOAD_URL: https://llama3-1.mlcommons-storage.org/metadata/<<>>.uri + adr: + dae: + extra_cache_tags: llama3,dataset,rclone rclone: group: download-tool add_deps_recursive: dae: tags: _rclone + prehook_deps: + - tags: get,rclone + enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - yes + - tags: get,rclone-config,_mlperf-llama3-1 + force_cache: true + enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - yes default: true + r2_downloader: + group: download-tool + add_deps_recursive: + dae: + tags: _r2_downloader dry-run: group: run-mode env: @@ -95,6 +113,9 @@ variations: dry-run,rclone: env: MLC_DOWNLOAD_EXTRA_OPTIONS: --dry-run + dry-run,r2_downloader: + env: + MLC_DOWNLOAD_EXTRA_OPTIONS: -x hf: group: download-src default_variations: @@ -110,6 +131,7 @@ variations: tags: _model-stub.meta-llama/Llama-3.1-405B-Instruct env: MLC_ML_MODEL_NAME: Llama-3.1-405B-Instruct + MLC_ML_MODEL_R2_HOSTED_NAME: llama3-1-405b-instruct MLC_MODEL_ZOO_ENV_KEY: LLAMA3 group: huggingface-stub @@ -121,6 +143,7 @@ variations: tags: _model-stub.meta-llama/Llama-3.1-8B-Instruct env: MLC_ML_MODEL_NAME: Llama-3.1-8b-Instruct + MLC_ML_MODEL_R2_HOSTED_NAME: llama3-1-8b-instruct MLC_MODEL_ZOO_ENV_KEY: LLAMA3 group: huggingface-stub From d2b64028afa810a497a80b2091678a2c86e7e061 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 8 Jul 2025 10:33:58 +0530 Subject: [PATCH 11/21] fix typo --- script/get-dataset-whisper/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/get-dataset-whisper/meta.yaml b/script/get-dataset-whisper/meta.yaml index 1652c2e81..37756aed9 100644 --- a/script/get-dataset-whisper/meta.yaml +++ b/script/get-dataset-whisper/meta.yaml @@ -54,7 +54,7 @@ variations: MLC_TMP_REQUIRE_DOWNLOAD: - 'yes' env: - MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_WHISPER_PAT + MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_WHISPER_PATH MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_WHISPER_PATH extra_cache_tags: whisper,dataset force_cache: true From 36c7d3a409f20ea338435c9ba8426a45d50e0fb6 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 8 Jul 2025 10:50:27 +0530 Subject: [PATCH 12/21] Add tests for llama3 model --- script/get-dataset-whisper/meta.yaml | 5 +++++ script/get-ml-model-llama3/meta.yaml | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/script/get-dataset-whisper/meta.yaml b/script/get-dataset-whisper/meta.yaml index 37756aed9..8f11f1583 100644 --- a/script/get-dataset-whisper/meta.yaml +++ b/script/get-dataset-whisper/meta.yaml @@ -10,6 +10,11 @@ tags: - dataset - whisper uid: 2cc955c795d44978 +tests: + run_inputs: + - variations_list: + - rclone,preprocessed,mlc,dry-run + - r2_downloader,preprocessed,mlc,dry-run variations: preprocessed: group: dataset-type diff --git a/script/get-ml-model-llama3/meta.yaml b/script/get-ml-model-llama3/meta.yaml index db7274afb..8e5f80b3d 100644 --- a/script/get-ml-model-llama3/meta.yaml +++ b/script/get-ml-model-llama3/meta.yaml @@ -33,6 +33,12 @@ tags: - llama3 - llama3-405b uid: 2f8cef2acc334e80 +tests: + run_inputs: + - variations_list: + - rclone,405b,mlc,dry-run + - r2_downloader,405b,mlc,dry-run + - r2_downloader,8b,mlc,dry-run variations: fp16: default: true From 5f475f2e6fa17ae8529d534356e084bf25aa0c70 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 9 Jul 2025 12:33:35 +0530 Subject: [PATCH 13/21] set needs pat to true --- script/get-ml-model-llama3/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/script/get-ml-model-llama3/meta.yaml b/script/get-ml-model-llama3/meta.yaml index 8e5f80b3d..fd79fdc33 100644 --- a/script/get-ml-model-llama3/meta.yaml +++ b/script/get-ml-model-llama3/meta.yaml @@ -34,6 +34,7 @@ tags: - llama3-405b uid: 2f8cef2acc334e80 tests: + needs_pat: true run_inputs: - variations_list: - rclone,405b,mlc,dry-run From 328cee07a8ede22d0c7bea393bde4f29bfc8c667 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 9 Jul 2025 12:53:53 +0530 Subject: [PATCH 14/21] Update meta.yaml --- script/get-dataset-whisper/meta.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/script/get-dataset-whisper/meta.yaml b/script/get-dataset-whisper/meta.yaml index 8f11f1583..3dcc0e717 100644 --- a/script/get-dataset-whisper/meta.yaml +++ b/script/get-dataset-whisper/meta.yaml @@ -53,6 +53,17 @@ variations: env: MLC_DOWNLOAD_SRC: mlcommons group: download-src + rclone: + add_deps_recursive: + dae: + tags: _rclone + default: true + group: download-tool + r2_downloader: + add_deps_recursive: + dae: + tags: _r2_downloader + group: download-tool mlc,preprocessed: prehook_deps: - enable_if_env: @@ -87,14 +98,3 @@ variations: r2_downloader,preprocessed: env: MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/whisper-dataset.uri - rclone: - add_deps_recursive: - dae: - tags: _rclone - default: true - group: download-tool - r2_downloader: - add_deps_recursive: - dae: - tags: _r2_downloader - group: download-tool From 228dcac089d79c03b5df3a876cf4ca4b9425a49b Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 9 Jul 2025 12:58:18 +0530 Subject: [PATCH 15/21] Update meta.yaml --- script/get-dataset-whisper/meta.yaml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/script/get-dataset-whisper/meta.yaml b/script/get-dataset-whisper/meta.yaml index 3dcc0e717..12fddf9e4 100644 --- a/script/get-dataset-whisper/meta.yaml +++ b/script/get-dataset-whisper/meta.yaml @@ -64,6 +64,19 @@ variations: dae: tags: _r2_downloader group: download-tool + rclone,preprocessed: + env: + MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/Whisper/dataset/ + prehook_deps: + - enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - true + tags: get,rclone + - enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - true + force_cache: true + tags: get,rclone-config,_mlc-inference mlc,preprocessed: prehook_deps: - enable_if_env: @@ -82,19 +95,6 @@ variations: update_tags_from_env_with_prefix: _url.: - MLC_DOWNLOAD_URL - rclone,preprocessed: - env: - MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/Whisper/dataset/ - prehook_deps: - - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - true - tags: get,rclone - - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - true - force_cache: true - tags: get,rclone-config,_mlc-inference r2_downloader,preprocessed: env: MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/whisper-dataset.uri From cf5efb3556e058bdd25b1bbb588260ada24bd62f Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 9 Jul 2025 13:02:09 +0530 Subject: [PATCH 16/21] handle dry run --- script/get-dataset-whisper/customize.py | 51 +++++++++++++------------ 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/script/get-dataset-whisper/customize.py b/script/get-dataset-whisper/customize.py index b2ae96020..373068eee 100644 --- a/script/get-dataset-whisper/customize.py +++ b/script/get-dataset-whisper/customize.py @@ -42,30 +42,31 @@ def postprocess(i): env = i['env'] - if env.get('MLC_TMP_DATASET_TYPE', '') != "preprocessed": - cwd = env.get('MLC_OUTDIRNAME', os.getcwd()) - data_dir = os.path.join(cwd, 'data') - env['MLC_DATASET_WHISPER_PATH'] = data_dir - else: - # copy files to data folder - tmp_src_dir = env["MLC_DATASET_WHISPER_PATH"] - tmp_dest_dir = os.path.join(tmp_src_dir, "data") - - os.makedirs(tmp_dest_dir, exist_ok=True) - - items_to_copy = [ - "LibriSpeech", - "dev-all", - "dev-all-repack", - "dev-all-repack.json" - ] - - for item in items_to_copy: - src_path = os.path.join(tmp_src_dir, item) - dst_path = os.path.join(tmp_dest_dir, item) - if os.path.isdir(src_path): - shutil.copytree(src_path, dst_path, dirs_exist_ok=True) - elif os.path.isfile(src_path): - shutil.copy2(src_path, dst_path) + if env.get('MLC_DOWNLOAD_MODE', '') != "dry": + if env.get('MLC_TMP_DATASET_TYPE', '') != "preprocessed": + cwd = env.get('MLC_OUTDIRNAME', os.getcwd()) + data_dir = os.path.join(cwd, 'data') + env['MLC_DATASET_WHISPER_PATH'] = data_dir + else: + # copy files to data folder + tmp_src_dir = env["MLC_DATASET_WHISPER_PATH"] + tmp_dest_dir = os.path.join(tmp_src_dir, "data") + + os.makedirs(tmp_dest_dir, exist_ok=True) + + items_to_copy = [ + "LibriSpeech", + "dev-all", + "dev-all-repack", + "dev-all-repack.json" + ] + + for item in items_to_copy: + src_path = os.path.join(tmp_src_dir, item) + dst_path = os.path.join(tmp_dest_dir, item) + if os.path.isdir(src_path): + shutil.copytree(src_path, dst_path, dirs_exist_ok=True) + elif os.path.isfile(src_path): + shutil.copy2(src_path, dst_path) return {'return': 0} From 8244a8f8b5437f001c60db0ecf62bd93a05dfaf0 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 9 Jul 2025 13:04:43 +0530 Subject: [PATCH 17/21] Update meta.yaml --- .../meta.yaml | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml b/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml index 3adcb0fd9..646fd08ec 100644 --- a/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml +++ b/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml @@ -35,34 +35,6 @@ variations: group: dataset-type env: MLC_PREPROCESSED_DATASET_TYPE: calibration - calibration,rclone: - env: - MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_calibration_dataset_500_fp8_eval.pkl - calibration,r2_downloader: - env: - MLC_DOWNLOAD_URL: "" - mlc: - group: download-src - default: true - prehook_deps: - - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - 'yes' - env: - MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_PREPROCESSED_DATASET_DEEPSEEK_R1_PATH - MLC_EXTRACT_FINAL_ENV_NAME: MLC_PREPROCESSED_DATASET_DEEPSEEK_R1_PATH - extra_cache_tags: deepseek-r1,dataset - force_cache: true - names: - - dae - tags: download-and-extract - force_env_keys: - - MLC_OUTDIRNAME - update_tags_from_env_with_prefix: - _url.: - - MLC_DOWNLOAD_URL - env: - MLC_DOWNLOAD_SRC: mlcommons r2_downloader: group: download-tool add_deps_recursive: @@ -91,6 +63,34 @@ variations: dry-run,rclone: env: MLC_DOWNLOAD_EXTRA_OPTIONS: --dry-run + calibration,rclone: + env: + MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_calibration_dataset_500_fp8_eval.pkl + calibration,r2_downloader: + env: + MLC_DOWNLOAD_URL: "" + mlc: + group: download-src + default: true + prehook_deps: + - enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - 'yes' + env: + MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_PREPROCESSED_DATASET_DEEPSEEK_R1_PATH + MLC_EXTRACT_FINAL_ENV_NAME: MLC_PREPROCESSED_DATASET_DEEPSEEK_R1_PATH + extra_cache_tags: deepseek-r1,dataset + force_cache: true + names: + - dae + tags: download-and-extract + force_env_keys: + - MLC_OUTDIRNAME + update_tags_from_env_with_prefix: + _url.: + - MLC_DOWNLOAD_URL + env: + MLC_DOWNLOAD_SRC: mlcommons tests: run_inputs: - variations_list: From f153c6269ec1c69c2e3b7b29ff3e745ff889f24a Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 10 Jul 2025 10:54:24 +0530 Subject: [PATCH 18/21] add r2 download support - cnndm,deepseekr1 --- script/get-dataset-cnndm/customize.py | 33 +++++----- script/get-dataset-cnndm/meta.yaml | 66 ++++++++++++++++--- .../meta.yaml | 6 +- 3 files changed, 77 insertions(+), 28 deletions(-) diff --git a/script/get-dataset-cnndm/customize.py b/script/get-dataset-cnndm/customize.py index 0115d0e8c..103bb4b95 100644 --- a/script/get-dataset-cnndm/customize.py +++ b/script/get-dataset-cnndm/customize.py @@ -26,22 +26,23 @@ def preprocess(i): def postprocess(i): env = i['env'] - if env.get('MLC_TMP_ML_MODEL', '') != "llama3_1-8b": - if is_false(env.get('MLC_DATASET_CALIBRATION', '')): - env['MLC_DATASET_PATH'] = os.path.join(os.getcwd(), 'install') - env['MLC_DATASET_EVAL_PATH'] = os.path.join( - os.getcwd(), 'install', 'cnn_eval.json') - env['MLC_DATASET_CNNDM_EVAL_PATH'] = os.path.join( - os.getcwd(), 'install', 'cnn_eval.json') - env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PATH'] + if env.get('MLC_DOWNLOAD_MODE', '') != "dry": + if env.get('MLC_TMP_ML_MODEL', '') != "llama3_1-8b": + if is_false(env.get('MLC_DATASET_CALIBRATION', '')): + env['MLC_DATASET_PATH'] = os.path.join(os.getcwd(), 'install') + env['MLC_DATASET_EVAL_PATH'] = os.path.join( + os.getcwd(), 'install', 'cnn_eval.json') + env['MLC_DATASET_CNNDM_EVAL_PATH'] = os.path.join( + os.getcwd(), 'install', 'cnn_eval.json') + env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PATH'] + else: + env['MLC_CALIBRATION_DATASET_PATH'] = os.path.join( + os.getcwd(), 'install', 'cnn_dailymail_calibration.json') + env['MLC_CALIBRATION_DATASET_CNNDM_PATH'] = os.path.join( + os.getcwd(), 'install', 'cnn_dailymail_calibration.json') + env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_CALIBRATION_DATASET_PATH'] else: - env['MLC_CALIBRATION_DATASET_PATH'] = os.path.join( - os.getcwd(), 'install', 'cnn_dailymail_calibration.json') - env['MLC_CALIBRATION_DATASET_CNNDM_PATH'] = os.path.join( - os.getcwd(), 'install', 'cnn_dailymail_calibration.json') - env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_CALIBRATION_DATASET_PATH'] - else: - env['MLC_DATASET_CNNDM_EVAL_PATH'] = os.path.join( - env['MLC_DATASET_CNNDM_EVAL_PATH'], env['MLC_DATASET_CNNDM_FILENAME']) + env['MLC_DATASET_CNNDM_EVAL_PATH'] = os.path.join( + env['MLC_DATASET_CNNDM_EVAL_PATH'], env['MLC_DATASET_CNNDM_FILENAME']) return {'return': 0} diff --git a/script/get-dataset-cnndm/meta.yaml b/script/get-dataset-cnndm/meta.yaml index f1cd45335..6b37bfcae 100644 --- a/script/get-dataset-cnndm/meta.yaml +++ b/script/get-dataset-cnndm/meta.yaml @@ -58,10 +58,20 @@ variations: datacenter: group: category rclone: + prehook_deps: + - tags: get,rclone + enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - yes group: download-tool add_deps_recursive: dae: tags: _rclone + r2_downloader: + group: download-tool + add_deps_recursive: + dae: + tags: _r2_downloader dry-run: group: run-mode env: @@ -69,18 +79,12 @@ variations: dry-run,rclone: env: MLC_DOWNLOAD_EXTRA_OPTIONS: --dry-run + dry-run,r2_downloader: + env: + MLC_DOWNLOAD_EXTRA_OPTIONS: -x mlc: group: download-src prehook_deps: - - tags: get,rclone - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - yes - - tags: get,rclone-config,_mlc-inference - force_cache: true - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - yes - enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: - 'yes' @@ -97,6 +101,13 @@ variations: - MLC_DOWNLOAD_URL env: MLC_DOWNLOAD_SRC: mlcommons + mlc,rclone: + prehook_deps: + - tags: get,rclone-config,_mlc-inference + force_cache: true + enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - yes calibration: env: MLC_DATASET_CALIBRATION: 'yes' @@ -126,6 +137,15 @@ variations: MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/<<>> MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH + validation,edge,llama3,mlc,r2_downloader: + adr: + dae: + extra_cache_tags: cnndm,dataset,llama3,val,edge + env: + MLC_DATASET_CNNDM_FILENAME: sample_cnn_eval_5000.json + MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/llama3-1-8b-sample-cnn-eval-5000.uri + MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH + MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH validation,datacenter,llama3,mlc,rclone: adr: dae: @@ -135,6 +155,15 @@ variations: MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/<<>> MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH + validation,datacenter,llama3,mlc,r2_downlaoder: + adr: + dae: + extra_cache_tags: cnndm,dataset,llama3,val,datacenter + env: + MLC_DATASET_CNNDM_FILENAME: cnn_eval.json + MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/llama3-1-8b-cnn-eval.uri + MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH + MLC_EXTRACT_FINAL_ENV_NAME: MLC_DATASET_CNNDM_EVAL_PATH calibation,llama3,mlc,rclone: adr: dae: @@ -144,4 +173,21 @@ variations: MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/<<>> MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_CALIBRATION_DATASET_CNNDM_PATH MLC_EXTRACT_FINAL_ENV_NAME: MLC_CALIBRATION_DATASET_CNNDM_PATH - \ No newline at end of file + calibation,llama3,mlc,r2_downloader: + adr: + dae: + extra_cache_tags: cnndm,dataset,llama3,calib + env: + MLC_DATASET_CNNDM_FILENAME: cnn_dailymail_calibration.json + MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/llama3-1-8b-cnn-dailymail-calibration.uri + MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_CALIBRATION_DATASET_CNNDM_PATH + MLC_EXTRACT_FINAL_ENV_NAME: MLC_CALIBRATION_DATASET_CNNDM_PATH +tests: + run_inputs: + - variations_list: + - validation,edge,rclone,llama3,mlc,dry-run + - validation,datacenter,rclone,llama3,mlc,dry-run + - validation,edge,r2_downloader,llama3,mlc,dry-run + - validation,datacenter,r2_downloader,llama3,mlc,dry-run + - calibration,rclone,llama3,mlc,dry-run + - calibration,r2_downloader,llama3,mlc,dry-run \ No newline at end of file diff --git a/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml b/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml index 3adcb0fd9..9c806fd51 100644 --- a/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml +++ b/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml @@ -30,7 +30,7 @@ variations: MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl validation,r2_downloader: env: - MLC_DOWNLOAD_URL: "" + MLC_DOWNLOAD_URL: "https://inference.mlcommons-storage.org/metadata/deepseek-r1-dataset-4388-fp8-eval.uri" calibration: group: dataset-type env: @@ -40,7 +40,7 @@ variations: MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_calibration_dataset_500_fp8_eval.pkl calibration,r2_downloader: env: - MLC_DOWNLOAD_URL: "" + MLC_DOWNLOAD_URL: "https://inference.mlcommons-storage.org/metadata/deepseek-r1-calibration-dataset-500-fp8-eval.uri" mlc: group: download-src default: true @@ -95,4 +95,6 @@ tests: run_inputs: - variations_list: - validation,rclone,mlc,dry-run + - validation,r2_downloader,mlc,dry-run - calibration,rclone,mlc,dry-run + - calibration,r2_downloader,mlc,dry-run From 9dfe1e5e8913d9bc62c3453e2bad5c2eb7eb8122 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 10 Jul 2025 11:08:12 +0530 Subject: [PATCH 19/21] fixes --- script/get-dataset-cnndm/meta.yaml | 14 +++---- .../meta.yaml | 39 +++---------------- 2 files changed, 13 insertions(+), 40 deletions(-) diff --git a/script/get-dataset-cnndm/meta.yaml b/script/get-dataset-cnndm/meta.yaml index 6b37bfcae..f0affa3b1 100644 --- a/script/get-dataset-cnndm/meta.yaml +++ b/script/get-dataset-cnndm/meta.yaml @@ -82,6 +82,13 @@ variations: dry-run,r2_downloader: env: MLC_DOWNLOAD_EXTRA_OPTIONS: -x + mlc,rclone: + prehook_deps: + - tags: get,rclone-config,_mlc-inference + force_cache: true + enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - yes mlc: group: download-src prehook_deps: @@ -101,13 +108,6 @@ variations: - MLC_DOWNLOAD_URL env: MLC_DOWNLOAD_SRC: mlcommons - mlc,rclone: - prehook_deps: - - tags: get,rclone-config,_mlc-inference - force_cache: true - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - yes calibration: env: MLC_DATASET_CALIBRATION: 'yes' diff --git a/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml b/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml index b32a3732e..8c0da0989 100644 --- a/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml +++ b/script/get-preprocessed-dataset-mlperf-deepseek-r1/meta.yaml @@ -30,39 +30,11 @@ variations: MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_dataset_4388_fp8_eval.pkl validation,r2_downloader: env: - MLC_DOWNLOAD_URL: "https://inference.mlcommons-storage.org/metadata/deepseek-r1-dataset-4388-fp8-eval.uri" + MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/deepseek-r1-dataset-4388-fp8-eval.uri calibration: group: dataset-type env: MLC_PREPROCESSED_DATASET_TYPE: calibration - calibration,rclone: - env: - MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_calibration_dataset_500_fp8_eval.pkl - calibration,r2_downloader: - env: - MLC_DOWNLOAD_URL: "https://inference.mlcommons-storage.org/metadata/deepseek-r1-calibration-dataset-500-fp8-eval.uri" - mlc: - group: download-src - default: true - prehook_deps: - - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - 'yes' - env: - MLC_DOWNLOAD_FINAL_ENV_NAME: MLC_PREPROCESSED_DATASET_DEEPSEEK_R1_PATH - MLC_EXTRACT_FINAL_ENV_NAME: MLC_PREPROCESSED_DATASET_DEEPSEEK_R1_PATH - extra_cache_tags: deepseek-r1,dataset - force_cache: true - names: - - dae - tags: download-and-extract - force_env_keys: - - MLC_OUTDIRNAME - update_tags_from_env_with_prefix: - _url.: - - MLC_DOWNLOAD_URL - env: - MLC_DOWNLOAD_SRC: mlcommons r2_downloader: group: download-tool add_deps_recursive: @@ -96,7 +68,7 @@ variations: MLC_DOWNLOAD_URL: mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deepseek_r1_calibration_dataset_500_fp8_eval.pkl calibration,r2_downloader: env: - MLC_DOWNLOAD_URL: "https://inference.mlcommons-storage.org/metadata/deepseek-r1-calibration-dataset-500-fp8-eval.uri" + MLC_DOWNLOAD_URL: https://inference.mlcommons-storage.org/metadata/deepseek-r1-calibration-dataset-500-fp8-eval.uri mlc: group: download-src default: true @@ -122,7 +94,8 @@ variations: tests: run_inputs: - variations_list: - - validation,rclone,mlc,dry-run - - validation,r2_downloader,mlc,dry-run - - calibration,rclone,mlc,dry-run - calibration,r2_downloader,mlc,dry-run + - validation,r2_downloader,mlc,dry-run + # - validation,rclone,mlc,dry-run + # - calibration,rclone,mlc,dry-run + From 4e2fc2e234cbdd79fcffeebc93626753ec4e5670 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 10 Jul 2025 11:33:22 +0530 Subject: [PATCH 20/21] fix rclone issue --- script/get-dataset-cnndm/meta.yaml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/script/get-dataset-cnndm/meta.yaml b/script/get-dataset-cnndm/meta.yaml index f0affa3b1..99fea5f11 100644 --- a/script/get-dataset-cnndm/meta.yaml +++ b/script/get-dataset-cnndm/meta.yaml @@ -63,6 +63,11 @@ variations: enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: - yes + - tags: get,rclone-config,_mlc-inference + force_cache: true + enable_if_env: + MLC_TMP_REQUIRE_DOWNLOAD: + - yes group: download-tool add_deps_recursive: dae: @@ -82,13 +87,6 @@ variations: dry-run,r2_downloader: env: MLC_DOWNLOAD_EXTRA_OPTIONS: -x - mlc,rclone: - prehook_deps: - - tags: get,rclone-config,_mlc-inference - force_cache: true - enable_if_env: - MLC_TMP_REQUIRE_DOWNLOAD: - - yes mlc: group: download-src prehook_deps: From 984db0ce7f918562dd0c5573ff994109eb7b5533 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 10 Jul 2025 11:41:42 +0530 Subject: [PATCH 21/21] skip checking for rclone dry run --- script/get-dataset-cnndm/meta.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/script/get-dataset-cnndm/meta.yaml b/script/get-dataset-cnndm/meta.yaml index 99fea5f11..cf70343e3 100644 --- a/script/get-dataset-cnndm/meta.yaml +++ b/script/get-dataset-cnndm/meta.yaml @@ -183,9 +183,9 @@ variations: tests: run_inputs: - variations_list: - - validation,edge,rclone,llama3,mlc,dry-run - - validation,datacenter,rclone,llama3,mlc,dry-run + # - validation,edge,rclone,llama3,mlc,dry-run + # - validation,datacenter,rclone,llama3,mlc,dry-run - validation,edge,r2_downloader,llama3,mlc,dry-run - validation,datacenter,r2_downloader,llama3,mlc,dry-run - - calibration,rclone,llama3,mlc,dry-run - - calibration,r2_downloader,llama3,mlc,dry-run \ No newline at end of file + - calibration,r2_downloader,llama3,mlc,dry-run + # - calibration,rclone,llama3,mlc,dry-run \ No newline at end of file