diff --git a/script/app-mlperf-inference-nvidia/customize.py b/script/app-mlperf-inference-nvidia/customize.py
index eddf02252..7d5371c9c 100644
--- a/script/app-mlperf-inference-nvidia/customize.py
+++ b/script/app-mlperf-inference-nvidia/customize.py
@@ -321,15 +321,30 @@ def preprocess(i):
             # path to which the data file is present
             target_data_path = os.path.join(
                 env['MLPERF_SCRATCH_PATH'],
-                'preprocessed_data',
-                'open_orca')
+                'data',
+                'llama2-70b')
             # path to the dataset file
             target_data_file_path = os.path.join(
+                env['MLPERF_SCRATCH_PATH'],
+                'data',
+                'llama2-70b',
+                'open_orca_gpt4_tokenized_llama.sampled_24576.pkl')
+
+            preprocessed_data_for_accuracy_checker = os.path.join(
                 env['MLPERF_SCRATCH_PATH'],
                 'preprocessed_data',
                 'open_orca',
                 'open_orca_gpt4_tokenized_llama.sampled_24576.pkl')
+
+            if not env.get('LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH'):
+                target_calibration_data_file_path = os.path.join(
+                    env['MLPERF_SCRATCH_PATH'],
+                    'data',
+                    'llama2-70b',
+                    'open_orca_gpt4_tokenized_llama.calibration_1000.pkl')
+
             tmp_tp_size = env['MLC_NVIDIA_TP_SIZE']
+            tmp_pp_size = env['MLC_NVIDIA_PP_SIZE']
             if tmp_tp_size == "1":
                 fp8_model_path = os.path.join(
                     env['MLPERF_SCRATCH_PATH'],
@@ -343,15 +358,34 @@ def preprocess(i):
                     'models',
                     'Llama2',
                     'fp8-quantized-ammo',
-                    f'llama2-70b-chat-hf-tp{tmp_tp_size}pp1-fp8')
+                    f'llama2-70b-chat-hf-tp{tmp_tp_size}pp{tmp_pp_size}-fp8')
+
+            # check the presence of the validation dataset
             if not os.path.exists(target_data_file_path):
-                if env.get('MLC_NVIDIA_LLAMA_DATASET_FILE_PATH', '') == '':
+                if env.get('MLC_DATASET_OPENORCA_PREPROCESSED_PATH', '') == '':
                     return {
-                        'return': 1, 'error': 'Please specify the path to LLAMA2 dataset (pickle file)'}
+                        'return': 1, 'error': 'Llama2 70B validation dataset not present.'}
                 if not os.path.exists(target_data_path):
-                    cmds.append(f"mkdir {target_data_path}")
+                    cmds.append(f"mkdir -p {target_data_path}")
                 cmds.append(
-                    f"ln -sf {env['MLC_NVIDIA_LLAMA_DATASET_FILE_PATH']} {target_data_file_path}")
+                    f"ln -sf {env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH']} {target_data_file_path}")
+
+            # check the presence of the calibration dataset
+            if not env.get('LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH'):
+                if not os.path.exists(target_calibration_data_file_path):
+                    if env.get('MLC_DATASET_OPENORCA_CALIBRATION_PATH', '') == '':
+                        return {
+                            'return': 1, 'error': 'Llama2 70B calibration dataset not present.'}
+                    if not os.path.exists(target_data_path):
+                        cmds.append(f"mkdir -p {target_data_path}")
+                    cmds.append(
+                        f"ln -sf {env['MLC_DATASET_OPENORCA_CALIBRATION_PATH']} {target_calibration_data_file_path}")
+
+            if not os.path.exists(preprocessed_data_for_accuracy_checker):
+                cmds.append(
+                    f"mkdir -p {os.path.dirname(preprocessed_data_for_accuracy_checker)}")
+                cmds.append(
+                    f"ln -sf {env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH']} {preprocessed_data_for_accuracy_checker}")

             model_name = "llama2-70b"
             model_path = fp8_model_path
@@ -550,6 +585,11 @@ def preprocess(i):
     if gpu_inference_streams:
         run_config += f" --gpu_inference_streams={gpu_inference_streams}"
+    model_precision = env.get(
+        'MLC_MLPERF_MODEL_PRECISION', '').replace('float', 'fp')
+    if model_precision:
+        run_config += f" --precision={model_precision}"
+
     dla_copy_streams = env.get(
         'MLC_MLPERF_NVIDIA_HARNESS_DLA_COPY_STREAMS')
     if dla_copy_streams:
         run_config += f" --dla_copy_streams={dla_copy_streams}"
@@ -688,8 +728,12 @@ def preprocess(i):
             run_config += f" --use_fp8"

         if "llama2" in env["MLC_MODEL"]:
-            run_config += f" --fp8_quant_model_path={fp8_model_path}"
-            run_config += f" --tensor_parallelism={tmp_tp_size}"
+            run_config += f" --checkpoint_dir={fp8_model_path}"
+            if env.get('MLC_MLPERF_INFERENCE_POST_5_0'):
+                run_config += f" --trtllm_build_flags=tensor_parallelism:{tmp_tp_size},pipeline_parallelism:{tmp_pp_size}"
+            else:
+                run_config += f" --tensor_parallelism={tmp_tp_size}"
+                run_config += f" --pipeline_parallelism={tmp_pp_size}"

         enable_sort = env.get('MLC_MLPERF_NVIDIA_HARNESS_ENABLE_SORT')
         if enable_sort and not is_false(enable_sort):
@@ -757,9 +801,11 @@ def preprocess(i):
         hpcx_paths.append("/opt/hpcx/ucx/lib")
     if os.path.exists("/opt/hpcx/ucc/lib"):
         hpcx_paths.append("/opt/hpcx/ucc/lib")
+    if os.path.exists("/opt/hpcx/ompi/lib"):
+        hpcx_paths.append("/opt/hpcx/ompi/lib")

     env['+LD_LIBRARY_PATH'] = hpcx_paths + env['+LD_LIBRARY_PATH']
-
+    env['+PYTHONPATH'] = []
     # print(env)

     return {'return': 0}
diff --git a/script/app-mlperf-inference-nvidia/meta.yaml b/script/app-mlperf-inference-nvidia/meta.yaml
index 6dda8d4a5..479020929 100644
--- a/script/app-mlperf-inference-nvidia/meta.yaml
+++ b/script/app-mlperf-inference-nvidia/meta.yaml
@@ -355,6 +355,8 @@ variations:
     group: batchsize-format-change
   v5.0+:
     group: batchsize-format-change
+    env:
+      MLC_MLPERF_INFERENCE_POST_5_0: "yes"
   v5.0:
     base:
     - v5.0+
@@ -1279,13 +1281,17 @@ variations:
       MLC_MLPERF_NVIDIA_HARNESS_NUM_SORT_SEGMENTS: '2'
       MLC_MLPERF_NVIDIA_HARNESS_SKIP_POSTPROCESS: True

-  gpu_memory.80,pre5.0,num-gpus.2,llama2-70b,offline,run_harness:
+  gpu_memory.80,pre5.0,num-gpus.2,llama2-70b_,offline,run_harness:
     default_variations:
       batch-size: batch_size.896

-  gpu_memory.80,v5.0+,num-gpus.2,llama2-70b,offline,run_harness:
+  gpu_memory.80,v5.0+,num-gpus.2,llama2-70b_,offline,run_harness:
     default_variations:
       batch-size: batch_size."llama2-70b:1024"
+
+  gpu_memory.80,v5.0+,num-gpus.8,llama2-70b_,offline,run_harness:
+    default_variations:
+      batch-size: batch_size."llama2-70b:4096"

   gpu_memory.16,pre5.0,gptj_,offline,run_harness:
     default_variations:
diff --git a/script/app-mlperf-inference/meta.yaml b/script/app-mlperf-inference/meta.yaml
index f9ad1283f..a0752e8fd 100644
--- a/script/app-mlperf-inference/meta.yaml
+++ b/script/app-mlperf-inference/meta.yaml
@@ -28,6 +28,8 @@ default_env:
   MLC_TEST_QUERY_COUNT: '10'
   MLC_MLPERF_QUANTIZATION: off
   MLC_GET_PLATFORM_DETAILS: no
+  MLC_NVIDIA_TP_SIZE: "2"
+  MLC_NVIDIA_PP_SIZE: "1"

 env:
   MLC_MLPERF_PRINT_SUMMARY: "no"
@@ -62,8 +64,8 @@ input_mapping:
   readme: MLC_MLPERF_README
   debug: MLC_DEBUG_SCRIPT_BENCHMARK_PROGRAM
   gpu_name: MLC_NVIDIA_GPU_NAME
-  nvidia_llama2_dataset_file_path: MLC_NVIDIA_LLAMA_DATASET_FILE_PATH
   tp_size: MLC_NVIDIA_TP_SIZE
+  pp_size: MLC_NVIDIA_PP_SIZE
   use_dataset_from_host: MLC_USE_DATASET_FROM_HOST
   predeps: False
@@ -324,9 +326,21 @@ variations:
           MLC_MLPERF_NVIDIA_SKIP_GPTJ:
           - "yes"
       - tags: get,ml-model,llama2-70b,_nvidia,_fp8
+        names:
+        - llama2-model
         update_tags_from_env_with_prefix:
           _tp-size.:
           - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+        skip_if_env:
+          MLC_MLPERF_NVIDIA_SKIP_LLAMA2_70B:
+          - "yes"
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+        skip_if_env:
+          MLC_MLPERF_NVIDIA_SKIP_LLAMA2_70B:
+          - "yes"
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
         skip_if_env:
           MLC_MLPERF_NVIDIA_SKIP_LLAMA2_70B:
           - "yes"
@@ -505,9 +519,15 @@ variations:
       image_name: mlperf-inference-nvidia-v4.1-dev-llm
       deps:
       - tags: get,ml-model,llama2-70b,_nvidia,_fp8
+        names:
+        - llama2-model
         update_tags_from_env_with_prefix:
           _tp-size.:
           - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
       env:
         BUILD_TRTLLM: 1

@@ -515,9 +535,15 @@ variations:
     docker:
       deps:
      - tags: get,ml-model,llama2-70b,_nvidia,_fp8
+        names:
+        - llama2-model
         update_tags_from_env_with_prefix:
           _tp-size.:
           - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
       env:
         BUILD_TRTLLM: 1

@@ -525,9 +551,33 @@ variations:
     docker:
       deps:
      - tags: get,ml-model,llama2-70b,_nvidia,_fp8
+        names:
+        - llama2-model
+        update_tags_from_env_with_prefix:
+          _tp-size.:
+          - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
+
+  nvidia-original,r5.1-dev_default,llama2-70b_:
+    default_variations:
+      precision: float8
+    docker:
+      deps:
+      - tags: get,ml-model,llama2-70b,_nvidia,_fp8,_v5.0
+        names:
+        - llama2-model
         update_tags_from_env_with_prefix:
           _tp-size.:
           - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
+      env:
+        BUILD_TRTLLM: 1

   nvidia-original:
     docker:
@@ -594,6 +644,8 @@ variations:
       update_tags_from_env_with_prefix:
         "_gpu_memory." :
         - MLC_NVIDIA_GPU_MEMORY
+        "_num-gpus.":
+        - MLC_CUDA_NUM_DEVICES
       update_tags_from_env:
       - MLC_NVIDIA_HARNESS_GPU_VARIATION

@@ -1293,6 +1345,16 @@ variations:
           MLC_USE_MODEL_FROM_HOST:
           - 'yes'
         tags: get,ml-model,llama2
+        names:
+        - llama2-model
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons
+        enable_if_any_env:
+          MLC_USE_DATASET_FROM_HOST:
+          - 'yes'
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons
+        enable_if_any_env:
+          MLC_USE_DATASET_FROM_HOST:
+          - 'yes'

   llama2-70b_,amd:
     docker:
@@ -1306,6 +1368,8 @@ variations:
           MLC_USE_MODEL_FROM_HOST:
           - 'yes'
         tags: get,ml-model,llama2,_amd,_pytorch
+        names:
+        - llama2-model

   mixtral-8x7b:
     group:
@@ -1830,6 +1894,12 @@ variations:
   fp32:
     alias: float32

+  fp4:
+    alias: float4
+
+  fp8:
+    alias: float8
+
   float32:
     group: precision
     default: true
@@ -1842,6 +1912,16 @@ variations:
       kilt-harness:
         tags: _fp32

+  float4:
+    group: precision
+    env:
+      MLC_MLPERF_MODEL_PRECISION: float4
+
+  float8:
+    group: precision
+    env:
+      MLC_MLPERF_MODEL_PRECISION: float8
+
   float16:
     group: precision
     env:
@@ -2128,10 +2208,10 @@ variations:
     - reproducibility
     add_deps_recursive:
       nvidia-inference-common-code:
-        tags: _custom,_v5.1-dev
+        tags: _mlcommons,_v5.1-dev
       nvidia-inference-server:
         version: r5.0
-        tags: _custom
+        tags: _mlcommons
       nvidia-harness:
         tags: _v5.0
       intel-harness:
@@ -2285,6 +2365,9 @@ docker:
   - "${{ GPTJ_CHECKPOINT_PATH }}:${{ GPTJ_CHECKPOINT_PATH }}"
   - "${{ MLC_CRITEO_PREPROCESSED_PATH }}:${{ MLC_CRITEO_PREPROCESSED_PATH }}"
   - "${{ LLAMA2_CHECKPOINT_PATH }}:${{ LLAMA2_CHECKPOINT_PATH }}"
+  - "${{ LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH }}:${{ LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH }}"
+  - "${{ MLC_DATASET_OPENORCA_PREPROCESSED_PATH }}:${{ MLC_DATASET_OPENORCA_PREPROCESSED_PATH }}"
+  - "${{ MLC_DATASET_OPENORCA_CALIBRATION_PATH }}:${{ MLC_DATASET_OPENORCA_CALIBRATION_PATH }}"
   - "${{ MLC_NVIDIA_LLAMA_DATASET_FILE_PATH }}:${{ MLC_NVIDIA_LLAMA_DATASET_FILE_PATH }}"
   - "${{ SDXL_CHECKPOINT_PATH }}:${{ SDXL_CHECKPOINT_PATH }}"
   - "${{ MLC_DATASET_KITS19_PREPROCESSED_PATH }}:${{ MLC_DATASET_KITS19_PREPROCESSED_PATH }}"
@@ -2314,3 +2397,4 @@ docker:
     intel_gptj_int8_model_path: MLC_MLPERF_INFERENCE_INTEL_GPTJ_INT8_MODEL_PATH
     nvidia_llama2_dataset_file_path: MLC_NVIDIA_LLAMA_DATASET_FILE_PATH
     tp_size: MLC_NVIDIA_TP_SIZE
+    pp_size: MLC_NVIDIA_PP_SIZE
diff --git a/script/build-mlperf-inference-server-nvidia/customize.py b/script/build-mlperf-inference-server-nvidia/customize.py
index 5fa70aa45..f6a2f399f 100644
--- a/script/build-mlperf-inference-server-nvidia/customize.py
+++ b/script/build-mlperf-inference-server-nvidia/customize.py
@@ -1,6 +1,7 @@
 from mlc import utils
 import os
 import shutil
+from utils import *


 def preprocess(i):
@@ -18,6 +19,15 @@ def preprocess(i):
         env['+LIBRARY_PATH'].append(os.path.join(
             env['MLC_TENSORRT_INSTALL_PATH'], "lib"))

+    hpcx_paths = []
+    if is_true(env.get('BUILD_TRTLLM')):
+        if os.path.exists("/opt/hpcx/ucx/lib"):
+            hpcx_paths.append("/opt/hpcx/ucx/lib")
+        if os.path.exists("/opt/hpcx/ucc/lib"):
+            hpcx_paths.append("/opt/hpcx/ucc/lib")
+        if os.path.exists("/opt/hpcx/ompi/lib"):
+            hpcx_paths.append("/opt/hpcx/ompi/lib")
+
     cxxflags = [
         "-Wno-error=switch",
         "-DDALI_1_15=1",
@@ -38,6 +48,8 @@ def preprocess(i):
         env['+ CXXFLAGS'] = []

     env['+ CXXFLAGS'] += cxxflags
+    env['+LD_LIBRARY_PATH'] = hpcx_paths + env['+LD_LIBRARY_PATH']
+    env['+PYTHONPATH'] = []

     return {'return': 0}
diff --git a/script/build-mlperf-inference-server-nvidia/meta.yaml b/script/build-mlperf-inference-server-nvidia/meta.yaml
index 8c57fd809..8b1a2a0e5 100644
--- a/script/build-mlperf-inference-server-nvidia/meta.yaml
+++ b/script/build-mlperf-inference-server-nvidia/meta.yaml
@@ -78,7 +78,7 @@ deps:

   # Detect CMake
   - tags: get,cmake
-    version: "3.25.1"
+    version: "3.26.4"

   # Detect Google Logger
   - tags: get,generic,sys-util,_glog-dev
@@ -101,6 +101,16 @@ deps:
   # Detect rapidjson-dev
   - tags: get,generic,sys-util,_rapidjson-dev

+  - tags: get,generic,sys-util,_ccache
+    enable_if_env:
+      BUILD_TRTLLM:
+      - 'yes'
+
+  - tags: get,generic,sys-util,_git-lfs
+    enable_if_env:
+      BUILD_TRTLLM:
+      - 'yes'
+
   # Download Nvidia Submission Code
   - tags: get,nvidia,mlperf,inference,common-code
diff --git a/script/get-generic-sys-util/meta.yaml b/script/get-generic-sys-util/meta.yaml
index cbb1bd9fc..77c48ef98 100644
--- a/script/get-generic-sys-util/meta.yaml
+++ b/script/get-generic-sys-util/meta.yaml
@@ -314,6 +314,20 @@ variations:
         brew: git-lfs
         dnf: git-lfs
         yum: git-lfs
+  ccache:
+    env:
+      MLC_SYS_UTIL_NAME: ccache
+      MLC_SYS_UTIL_VERSION_CMD: ccache --version
+      MLC_SYS_UTIL_VERSION_RE: "ccache version ([0-9.]+)"
+      MLC_TMP_VERSION_DETECT_GROUP_NUMBER: 0
+    new_env_keys:
+    - MLC_CCACHE_VERSION
+    state:
+      ccache:
+        apt: ccache
+        brew: ccache
+        dnf: ccache
+        yum: ccache
   glog-dev:
     env:
       MLC_SYS_UTIL_NAME: glog-dev
diff --git a/script/get-git-repo/run.sh b/script/get-git-repo/run.sh
index 0e0c19324..2d914ae84 100644
--- a/script/get-git-repo/run.sh
+++ b/script/get-git-repo/run.sh
@@ -64,7 +64,7 @@ if [ ! -z ${MLC_GIT_PR_TO_APPLY} ]; then
   git fetch origin ${MLC_GIT_PR_TO_APPLY}:tmp-apply
 fi

-IFS=',' read -r -a cherrypicks <<< "${MLC_GIT_CHERRYPICKS}"
+IFS=';' read -r -a cherrypicks <<< "${MLC_GIT_CHERRYPICKS}"
 for cherrypick in "${cherrypicks[@]}"
 do
   echo ""
@@ -73,18 +73,18 @@ do
   test $? -eq 0 || exit $?
done -IFS=',' read -r -a submodules <<< "${MLC_GIT_SUBMODULES}" +IFS=';' read -r -a submodules <<< "${MLC_GIT_SUBMODULES}" for submodule in "${submodules[@]}" do echo "" echo "Initializing submodule ${submodule}" - git submodule update --init "${submodule}" + git submodule update --init --recursive --checkout --force "${submodule}" test $? -eq 0 || exit $? done if [ ${MLC_GIT_PATCH} == "yes" ]; then - IFS=', ' read -r -a patch_files <<< ${MLC_GIT_PATCH_FILEPATHS} + IFS=';' read -r -a patch_files <<< ${MLC_GIT_PATCH_FILEPATHS} for patch_file in "${patch_files[@]}" do echo "" diff --git a/script/get-ml-model-llama2/customize.py b/script/get-ml-model-llama2/customize.py index 491e76248..3a5e43232 100644 --- a/script/get-ml-model-llama2/customize.py +++ b/script/get-ml-model-llama2/customize.py @@ -1,4 +1,5 @@ from mlc import utils +from utils import is_true import os @@ -8,13 +9,17 @@ def preprocess(i): env = i['env'] if env.get('MLC_TMP_ML_MODEL_PROVIDER', '') == 'nvidia': - i['run_script_input']['script_name'] = 'run-nvidia' - gpu_arch = int( - float( - env['MLC_CUDA_DEVICE_PROP_GPU_COMPUTE_CAPABILITY']) * - 10) - env['MLC_GPU_ARCH'] = gpu_arch - env['MLC_TMP_REQUIRE_DOWNLOAD'] = 'no' + if is_true(env.get('MLC_ML_MODEL_QUANTIZE_LOCALLY')): + i['run_script_input']['script_name'] = 'run-nvidia' + gpu_arch = int( + float( + env['MLC_CUDA_DEVICE_PROP_GPU_COMPUTE_CAPABILITY']) * + 10) + env['MLC_GPU_ARCH'] = gpu_arch + env['MLC_TMP_REQUIRE_DOWNLOAD'] = 'no' + else: + run_cmd = f"ln -sf {env['LLAMA2_CHECKPOINT_PATH']} {env['MLC_NVIDIA_MLPERF_SCRATCH_PATH']}/models/Llama2/fp8-quantized-ammo/llama-2-70b-chat-hf-tp{env['MLC_NVIDIA_TP_SIZE']}pp{env['MLC_NVIDIA_PP_SIZE']}-{env['MLC_ML_MODEL_PRECISION']}" + env['MLC_RUN_CMD'] = run_cmd else: path = env.get('LLAMA2_CHECKPOINT_PATH', '').strip() @@ -37,11 +42,14 @@ def postprocess(i): env = i['env'] if env.get('MLC_DOWNLOAD_MODE', '') != "dry": - if env.get('LLAMA2_CHECKPOINT_PATH', '') == '': - env['LLAMA2_CHECKPOINT_PATH'] = env['MLC_ML_MODEL_PATH'] + if is_true(env.get('MLC_ML_MODEL_QUANTIZE_LOCALLY')): + if env.get('LLAMA2_CHECKPOINT_PATH', '') == '': + env['LLAMA2_CHECKPOINT_PATH'] = env['MLC_ML_MODEL_PATH'] + else: + env['MLC_ML_MODEL_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] + env['MLC_ML_MODEL_LLAMA2_FILE_WITH_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] + env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_ML_MODEL_PATH'] else: - env['MLC_ML_MODEL_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] - env['MLC_ML_MODEL_LLAMA2_FILE_WITH_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] - env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_ML_MODEL_PATH'] + env['LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] return {'return': 0} diff --git a/script/get-ml-model-llama2/meta.yaml b/script/get-ml-model-llama2/meta.yaml index a2b0c3c1e..45895030f 100644 --- a/script/get-ml-model-llama2/meta.yaml +++ b/script/get-ml-model-llama2/meta.yaml @@ -15,6 +15,7 @@ new_env_keys: - LLAMA2_CHECKPOINT_PATH - MLC_NVIDIA_TP_SIZE - MLC_LLAMA2_FINAL_SAFE_TENSORS_PATH +- LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH prehook_deps: - enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: @@ -48,10 +49,15 @@ tests: - r2-downloader,70b,mlc,dry-run - r2-downloader,7b,mlc,dry-run variations: - L40s: + pre-quantized: + group: quantization env: - MLC_NVIDIA_TP_SIZE: 4 - group: gpu + MLC_ML_MODEL_PRE_QUANTIZED: 'yes' + quantize-locally: + default: true + group: quantization + env: + MLC_ML_MODEL_QUANTIZE_LOCALLY: 'yes' amd: default_env: MLC_LLAMA2_QUANTIZATION_DEVICE: '' @@ -80,10 +86,6 @@ variations: 
       MLC_ML_MODEL_PRECISION: fp8
       MLC_ML_MODEL_WEIGHT_DATA_TYPES: fp8
     group: precision
-  generic:
-    env:
-      MLC_NVIDIA_TP_SIZE: 2
-    group: gpu
   int8:
     env:
       MLC_ML_MODEL_INPUT_DATA_TYPES: int8
@@ -142,18 +144,21 @@ variations:
     update_tags_from_env_with_prefix:
       _url.:
       - MLC_DOWNLOAD_URL
-  mlc,rclone,70b:
+  mlc,rclone,70b,quantize-locally:
     env:
       MLC_DOWNLOAD_URL: mlc-llama2:Llama-2-70b-chat-hf
-  mlc,rclone,7b:
+  mlc,rclone,7b,quantize-locally:
     env:
       MLC_DOWNLOAD_URL: mlc-llama2:Llama-2-7b-chat-hf
-  mlc,r2-downloader,70b:
+  mlc,r2-downloader,70b,quantize-locally:
     env:
       MLC_DOWNLOAD_URL: https://llama2.mlcommons-storage.org/metadata/llama-2-70b-chat-hf.uri
-  mlc,r2-downloader,7b:
+  mlc,r2-downloader,7b,quantize-locally:
     env:
       MLC_DOWNLOAD_URL: https://llama2.mlcommons-storage.org/metadata/llama-2-7b-chat-hf.uri
+  mlc,r2-downloader,70b,pre-quantized,fp8:
+    env:
+      MLC_DOWNLOAD_URL: https://llama2.mlcommons-storage.org/metadata/llama-2-70b-chat-hf-tp<>pp<>-<<>>.uri
   hf:
     group: download-source
     env:
@@ -203,6 +208,10 @@ variations:
       framework: pytorch
     env:
       MLC_TMP_ML_MODEL_PROVIDER: nvidia
+    deps:
+    - tags: get,nvidia,scratch,space
+      names:
+      - mlperf-inference-nvidia-scratch-space
     group: model-provider
   pytorch:
     default: true
@@ -233,19 +242,21 @@ variations:
     - tags: get,generic-python-lib,_package.compressed_tensors
   pytorch,fp32:
     env: {}
-  pytorch,nvidia:
+  pytorch,nvidia,v5.0:
+    deps:
+    - env:
+        MLC_GIT_CHECKOUT_PATH_ENV_NAME: MLC_TENSORRT_LLM_CHECKOUT_PATH
+      extra_cache_tags: tensorrt-llm
+      tags: get,git,repo,_repo.https://github.com/NVIDIA/TensorRT-LLM.git,_sha.2ea17cdad28bed0f30e80eea5b1380726a7c6493,_submodules.3rdparty/NVTX;3rdparty/cutlass;3rdparty/cxxopts;3rdparty/json;3rdparty/pybind11;3rdparty/ucxx;3rdparty/xgrammar
+  pytorch,nvidia,quantize-locally:
     default_variations:
-      gpu: generic
       precision: fp8
+      tp-size: tp-size.2
+      pp-size: pp-size.1
     deps:
-    - env:
-        MLC_GIT_CHECKOUT_PATH_ENV_NAME: MLC_TENSORRT_LLM_CHECKOUT_PATH
-      extra_cache_tags: tensorrt-llm
-      tags: get,git,repo,_repo.https://github.com/NVIDIA/TensorRT-LLM.git,_sha.0ab9d17a59c284d2de36889832fe9fc7c8697604
    - names:
      - cuda
      tags: get,cuda
-    - tags: get,nvidia,scratch,space
    - tags: get,cuda-devices,_with-pycuda
    - env: {}
      force_new_env_keys:
@@ -254,6 +265,7 @@ variations:
    - names:
      - nvidia-inference-common-code
      tags: get,nvidia,inference,common-code
+    - tags: get,preprocessed,dataset,openorca,_calibration,_mlc,_nvidia
    - names:
      - python
      - python3
@@ -268,10 +280,17 @@ variations:
   tp-size.#:
     env:
       MLC_NVIDIA_TP_SIZE: '#'
-    group: gpu
+    group: tp-size
+  pp-size.#:
+    env:
+      MLC_NVIDIA_PP_SIZE: '#'
+    group: pp-size
   uint8:
     env:
       MLC_ML_MODEL_INPUT_DATA_TYPES: uint8
       MLC_ML_MODEL_PRECISION: uint8
       MLC_ML_MODEL_WEIGHT_DATA_TYPES: uint8
     group: precision
+  v5.0:
+    group: version
+    default: true
diff --git a/script/get-ml-model-llama2/run-nvidia.sh b/script/get-ml-model-llama2/run-nvidia.sh
index d38e911cb..ffabbe4c6 100644
--- a/script/get-ml-model-llama2/run-nvidia.sh
+++ b/script/get-ml-model-llama2/run-nvidia.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 echo "Set tp size is ${MLC_NVIDIA_TP_SIZE}"
+echo "Set pp size is ${MLC_NVIDIA_PP_SIZE}"

 if [[ ! -e ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}/models/Llama2/Llama-2-70b-chat-hf ]]; then
     mkdir -p ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}/models/Llama2/Llama-2-70b-chat-hf
     cd ${LLAMA2_CHECKPOINT_PATH}
     cp -r ${LLAMA2_CHECKPOINT_PATH}/* ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}/models/Llama2/Llama-2-70b-chat-hf
-    test $? -eq 0 || exit $?
 fi

 echo "cd ${MLC_TENSORRT_LLM_CHECKOUT_PATH}"
 cd ${MLC_TENSORRT_LLM_CHECKOUT_PATH}
@@ -16,11 +16,14 @@
 make -C docker build
 test $? -eq 0 || exit $?
if [ "${MLC_NVIDIA_TP_SIZE}" -eq 1 ]; then - RUN_CMD="bash -c 'python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp1-fp8-02072024 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE}'" + RUN_CMD="bash -c 'git lfs install && git lfs pull && python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --use_ccache --benchmarks --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp${MLC_NVIDIA_PP_SIZE}-fp8-02072024 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE} --pp_size ${MLC_NVIDIA_PP_SIZE} --calib_dataset=/calib_dataset'" + echo "$RUN_CMD" else - RUN_CMD="bash -c 'python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp1-fp8 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE}'" + RUN_CMD="bash -c 'git lfs install && git lfs pull && python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --use_ccache --benchmarks --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp${MLC_NVIDIA_PP_SIZE}-fp8 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE} --pp_size ${MLC_NVIDIA_PP_SIZE} --calib_dataset=/calib_dataset'" + echo "$RUN_CMD" fi -DOCKER_RUN_ARGS=" -v ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}:/mnt" +# TODO: check whether --device nvidia.com/gpu=all would work for docker +DOCKER_RUN_ARGS=" -v ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}:/mnt -v ${MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH}:/calib_dataset -u $(id -u):$(id -g) --userns=keep-id --device nvidia.com/gpu=all -e NVIDIA_VISIBLE_DEVICES=all" export DOCKER_RUN_ARGS="$DOCKER_RUN_ARGS" export RUN_CMD="$RUN_CMD" make -C docker run LOCAL_USER=1 diff --git a/script/get-ml-model-llama2/run.sh b/script/get-ml-model-llama2/run.sh new file mode 100644 index 000000000..5c87d5618 --- /dev/null +++ b/script/get-ml-model-llama2/run.sh @@ -0,0 +1,3 @@ +echo "${MLC_RUN_CMD}" +eval ${MLC_RUN_CMD} +test $? -eq 0 || exit $? 
\ No newline at end of file
diff --git a/script/get-preprocessed-dataset-openorca/customize.py b/script/get-preprocessed-dataset-openorca/customize.py
index daeef32f4..a61128618 100644
--- a/script/get-preprocessed-dataset-openorca/customize.py
+++ b/script/get-preprocessed-dataset-openorca/customize.py
@@ -9,21 +9,16 @@ def preprocess(i):
     env = i['env']

     if is_true(str(env.get('MLC_DATASET_PREPROCESSED_BY_MLC', ''))):
-        run_dir = os.getcwd()
-        if is_true(env.get('MLC_DATASET_CALIBRATION', '')):
-            env['MLC_DATASET_CALIBRATION_PATH'] = os.path.join(
-                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
-                "open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz")
-            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
-            env['MLC_DATASET_OPENORCA_CALIBRATION_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
-        else:
-            env['MLC_DATASET_PREPROCESSED_PATH'] = os.path.join(
-                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
-                "open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz")
-            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
-            env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
-        # run_cmd = f"gunzip -k {env['MLC_DATASET_PREPROCESSED_PATH']}"
+        run_dir = env['MLC_OPENORCA_PREPROCESSED_ROOT']
         run_cmd = ''
+        env['MLC_DATASET_CALIBRATION_PATH'] = os.path.join(
+            env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+            "open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz")
+        run_cmd = f"gzip -dkf {env['MLC_DATASET_CALIBRATION_PATH']}"
+        env['MLC_DATASET_PREPROCESSED_PATH'] = os.path.join(
+            env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+            "open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz")
+        run_cmd += f" && gzip -dkf {env['MLC_DATASET_PREPROCESSED_PATH']}"
     else:
         inference_src = env['MLC_MLPERF_INFERENCE_SOURCE']
         run_dir = os.path.join(inference_src, 'language', 'llama2-70b')
@@ -52,4 +47,26 @@ def postprocess(i):
 def postprocess(i):

     env = i['env']
+    if is_true(str(env.get('MLC_DATASET_PREPROCESSED_BY_MLC', ''))):
+        env['PREPROCESSED_DATA_DIR'] = os.path.dirname(
+            env['MLC_OPENORCA_PREPROCESSED_ROOT'])
+        if is_true(env.get('MLC_DATASET_CALIBRATION', '')):
+            env['MLC_DATASET_CALIBRATION_PATH'] = os.path.join(
+                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+                "open_orca_gpt4_tokenized_llama.calibration_1000.pkl")
+            if env.get('MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER',
+                       '') == "nvidia":
+                env['MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH'] = os.path.join(
+                    env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+                    "preprocessed_data",
+                    "mlperf_llama2_openorca_calibration_1k")
+            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
+            env['MLC_DATASET_OPENORCA_CALIBRATION_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
+        else:
+            env['MLC_DATASET_PREPROCESSED_PATH'] = os.path.join(
+                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+                "open_orca_gpt4_tokenized_llama.sampled_24576.pkl")
+            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
+            env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
+
     return {'return': 0}
diff --git a/script/get-preprocessed-dataset-openorca/meta.yaml b/script/get-preprocessed-dataset-openorca/meta.yaml
index 5e9f1a83d..dbcf7e46a 100644
--- a/script/get-preprocessed-dataset-openorca/meta.yaml
+++ b/script/get-preprocessed-dataset-openorca/meta.yaml
@@ -8,6 +8,10 @@ default_env:
   MLC_DATASET_CALIBRATION: 'no'
 deps:
 - tags: get,sys-utils-mlc
+  skip_if_env:
+    MLC_DATASET_PREPROCESSED_BY_MLC:
+    - 'on'
+    - 'yes'
 - names:
   - python
   - python3
@@ -38,6 +42,30 @@ deps:
 - names:
   - transformers
   tags: get,generic-python-lib,_package.transformers
+- names:
+  - datasets
+  tags: get,generic-python-lib,_package.datasets
+  version_max: 2.19.2
+  enable_if_env:
+    MLC_DATASET_PREPROCESSED_BY_MLC:
+    - 'on'
+    - 'yes'
+- names:
+  - numpy
+  tags: get,generic-python-lib,_package.numpy
+  version_max: 1.26.4
+  enable_if_env:
+    MLC_DATASET_PREPROCESSED_BY_MLC:
+    - 'on'
+    - 'yes'
+- names:
+  - pandas
+  tags: get,generic-python-lib,_package.pandas
+  version_max: 2.2.2
+  enable_if_env:
+    MLC_DATASET_PREPROCESSED_BY_MLC:
+    - 'on'
+    - 'yes'
 - skip_if_env:
     MLC_DATASET_PREPROCESSED_BY_MLC:
     - 'on'
@@ -87,7 +115,7 @@ variations:
       extra_cache_tags: openorca,preprocessed,dataset
       force_cache: true
       names:
-      - dae
+      - dae-openorca
       tags: download-and-extract,_rclone
       update_tags_from_env_with_prefix:
         _url.:
@@ -108,3 +136,11 @@ variations:
     new_env_keys:
     - MLC_DATASET_PREPROCESSED_PATH
     - MLC_DATASET_OPENORCA_PREPROCESSED_PATH
+    - PREPROCESSED_DATA_DIR
+  nvidia:
+    group: preprocess-step-provider
+    env:
+      MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER: nvidia
+  nvidia,calibration:
+    new_env_keys:
+    - MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH
diff --git a/script/get-preprocessed-dataset-openorca/nvidia_preprocess.py b/script/get-preprocessed-dataset-openorca/nvidia_preprocess.py
new file mode 100644
index 000000000..013aa60e7
--- /dev/null
+++ b/script/get-preprocessed-dataset-openorca/nvidia_preprocess.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Script to preprocess the data for Llama2-70b.""" + +import argparse +import logging +from pathlib import Path + +import numpy as np +import pandas as pd +from datasets import Dataset + +G_MAX_INPUT_TOK_LEN = 2048 +G_LLAMA2_EOS = 2 + + +def preprocess_data(data_dir, preprocessed_data_dir): + data_dir = Path(data_dir) + preprocessed_data_dir = Path(preprocessed_data_dir) + preprocessed_data_dir.mkdir(parents=True, exist_ok=True) + + # Load inference data + inference_pkl_path = data_dir / "open_orca_gpt4_tokenized_llama.sampled_24576.pkl" + df = pd.read_pickle(inference_pkl_path) + toks = df['tok_input'].to_list() + toks_np = np.ones((len(toks), G_MAX_INPUT_TOK_LEN), + dtype=np.int32) * G_LLAMA2_EOS + tok_len_np = df['tok_input_length'].to_numpy().astype(np.int32) + + for i, q in enumerate(toks): + toks_np[i, :len(q)] = q + assert len(q) == tok_len_np[i] + + np.save(preprocessed_data_dir / "input_ids_padded.npy", toks_np) + np.save(preprocessed_data_dir / "input_lens.npy", tok_len_np) + + # Load calibration data + calib_pkl_path = data_dir / "open_orca_gpt4_tokenized_llama.calibration_1000.pkl" + calib_df = pd.read_pickle(calib_pkl_path) + + if 'input' not in calib_df.columns: + raise ValueError("The DataFrame does not contain an 'input' column.") + + hf_dataset = Dataset.from_pandas(calib_df[['input']]) + hf_dataset = hf_dataset.rename_column("input", "text") + + dataset_dir = preprocessed_data_dir / 'mlperf_llama2_openorca_calibration_1k' + dataset_dir.mkdir(parents=True, exist_ok=True) + hf_dataset.to_parquet(dataset_dir / "data.parquet") + + logging.info(f"Done preprocessing llama2 at {preprocessed_data_dir}") + + +def main(): + parser = argparse.ArgumentParser( + description="Preprocess Llama2 data for TensorRT") + parser.add_argument( + "--data_dir", "-d", + help="Path to the input open_orca pickle file", + default="build/data" + ) + parser.add_argument( + "--preprocessed_data_dir", "-o", + help="Output directory for the preprocessed data.", + default="build/preprocessed_data" + ) + args = parser.parse_args() + + preprocess_data(args.data_dir, args.preprocessed_data_dir) + + +if __name__ == "__main__": + main() diff --git a/script/get-preprocessed-dataset-openorca/run.sh b/script/get-preprocessed-dataset-openorca/run.sh index aa7be3116..b637ad585 100644 --- a/script/get-preprocessed-dataset-openorca/run.sh +++ b/script/get-preprocessed-dataset-openorca/run.sh @@ -3,3 +3,16 @@ cd ${MLC_RUN_DIR} echo "${MLC_RUN_CMD}" eval "${MLC_RUN_CMD}" +test $? -eq 0 || exit $? + +if { [ "${MLC_DATASET_PREPROCESSED_BY_MLC}" = "true" ] || \ + [ "${MLC_DATASET_PREPROCESSED_BY_MLC}" = "yes" ] || \ + [ "${MLC_DATASET_PREPROCESSED_BY_MLC}" = "1" ]; } && \ + [ "${MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER}" = "nvidia" ]; then + + ${MLC_PYTHON_BIN_WITH_PATH} ${MLC_TMP_CURRENT_SCRIPT_PATH}/nvidia_preprocess.py \ + -d ${MLC_OPENORCA_PREPROCESSED_ROOT} \ + -o ${MLC_OPENORCA_PREPROCESSED_ROOT}/preprocessed_data + test $? -eq 0 || exit $? 
+fi
+
diff --git a/script/run-docker-container/customize.py b/script/run-docker-container/customize.py
index 3c40bac93..aa27dcfb6 100644
--- a/script/run-docker-container/customize.py
+++ b/script/run-docker-container/customize.py
@@ -179,7 +179,10 @@ def postprocess(i):
     if env.get('MLC_DOCKER_ADD_NUM_GPUS', '') != '':
         run_opts += " --gpus={}".format(env['MLC_DOCKER_ADD_NUM_GPUS'])
     elif env.get('MLC_DOCKER_ADD_ALL_GPUS', '') != '':
-        run_opts += " --gpus=all"
+        if env.get('MLC_CONTAINER_TOOL') == "podman":
+            run_opts += " --device nvidia.com/gpu=all"
+        else:
+            run_opts += " --gpus=all"

     if env.get('MLC_DOCKER_SHM_SIZE', '') != '':
         run_opts += " --shm-size={}".format(env['MLC_DOCKER_SHM_SIZE'])