diff --git a/script/app-mlperf-inference-nvidia/customize.py b/script/app-mlperf-inference-nvidia/customize.py
index eddf02252..7d5371c9c 100644
--- a/script/app-mlperf-inference-nvidia/customize.py
+++ b/script/app-mlperf-inference-nvidia/customize.py
@@ -321,15 +321,30 @@ def preprocess(i):
             # path to which the data file is present
             target_data_path = os.path.join(
                 env['MLPERF_SCRATCH_PATH'],
-                'preprocessed_data',
-                'open_orca')
+                'data',
+                'llama2-70b')
             # path to the dataset file
             target_data_file_path = os.path.join(
+                env['MLPERF_SCRATCH_PATH'],
+                'data',
+                'llama2-70b',
+                'open_orca_gpt4_tokenized_llama.sampled_24576.pkl')
+
+            preprocessed_data_for_accuracy_checker = os.path.join(
                 env['MLPERF_SCRATCH_PATH'],
                 'preprocessed_data',
                 'open_orca',
                 'open_orca_gpt4_tokenized_llama.sampled_24576.pkl')
+
+            if not env.get('LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH'):
+                target_calibration_data_file_path = os.path.join(
+                    env['MLPERF_SCRATCH_PATH'],
+                    'data',
+                    'llama2-70b',
+                    'open_orca_gpt4_tokenized_llama.calibration_1000.pkl')
+
             tmp_tp_size = env['MLC_NVIDIA_TP_SIZE']
+            tmp_pp_size = env['MLC_NVIDIA_PP_SIZE']
             if tmp_tp_size == "1":
                 fp8_model_path = os.path.join(
                     env['MLPERF_SCRATCH_PATH'],
@@ -343,15 +358,34 @@ def preprocess(i):
                     'models',
                     'Llama2',
                     'fp8-quantized-ammo',
-                    f'llama2-70b-chat-hf-tp{tmp_tp_size}pp1-fp8')
+                    f'llama2-70b-chat-hf-tp{tmp_tp_size}pp{tmp_pp_size}-fp8')
+
+            # check the presence of the validation dataset
             if not os.path.exists(target_data_file_path):
-                if env.get('MLC_NVIDIA_LLAMA_DATASET_FILE_PATH', '') == '':
+                if env.get('MLC_DATASET_OPENORCA_PREPROCESSED_PATH', '') == '':
                     return {
-                        'return': 1, 'error': 'Please specify the path to LLAMA2 dataset (pickle file)'}
+                        'return': 1, 'error': 'Llama2 70B validation dataset not present.'}
                 if not os.path.exists(target_data_path):
-                    cmds.append(f"mkdir {target_data_path}")
+                    cmds.append(f"mkdir -p {target_data_path}")
                 cmds.append(
-                    f"ln -sf {env['MLC_NVIDIA_LLAMA_DATASET_FILE_PATH']} {target_data_file_path}")
+                    f"ln -sf {env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH']} {target_data_file_path}")
+
+            # check the presence of the calibration dataset
+            if not env.get('LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH'):
+                if not os.path.exists(target_calibration_data_file_path):
+                    if env.get('MLC_DATASET_OPENORCA_CALIBRATION_PATH', '') == '':
+                        return {
+                            'return': 1, 'error': 'Llama2 70B calibration dataset not present.'}
+                    if not os.path.exists(target_data_path):
+                        cmds.append(f"mkdir -p {target_data_path}")
+                    cmds.append(
+                        f"ln -sf {env['MLC_DATASET_OPENORCA_CALIBRATION_PATH']} {target_calibration_data_file_path}")
+
+            if not os.path.exists(preprocessed_data_for_accuracy_checker):
+                cmds.append(
+                    f"mkdir -p {os.path.dirname(preprocessed_data_for_accuracy_checker)}")
+                cmds.append(
+                    f"ln -sf {env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH']} {preprocessed_data_for_accuracy_checker}")

             model_name = "llama2-70b"
             model_path = fp8_model_path
@@ -550,6 +585,11 @@ def preprocess(i):
     if gpu_inference_streams:
         run_config += f" --gpu_inference_streams={gpu_inference_streams}"
+    model_precision = env.get(
+        'MLC_MLPERF_MODEL_PRECISION', '').replace('float', 'fp')
+    if model_precision:
+        run_config += f" --precision={model_precision}"
+
     dla_copy_streams = env.get(
         'MLC_MLPERF_NVIDIA_HARNESS_DLA_COPY_STREAMS')
     if dla_copy_streams:
         run_config += f" --dla_copy_streams={dla_copy_streams}"
@@ -688,8 +728,12 @@ def preprocess(i):
             run_config += f" --use_fp8"

         if "llama2" in env["MLC_MODEL"]:
-            run_config += f" --fp8_quant_model_path={fp8_model_path}"
-            run_config += f" --tensor_parallelism={tmp_tp_size}"
+            run_config += f" --checkpoint_dir={fp8_model_path}"
+            if env.get('MLC_MLPERF_INFERENCE_POST_5_0'):
+                run_config += f" --trtllm_build_flags=tensor_parallelism:{tmp_tp_size},pipeline_parallelism:{tmp_pp_size}"
+            else:
+                run_config += f" --tensor_parallelism={tmp_tp_size}"
+                run_config += f" --pipeline_parallelism={tmp_pp_size}"

         enable_sort = env.get('MLC_MLPERF_NVIDIA_HARNESS_ENABLE_SORT')
         if enable_sort and not is_false(enable_sort):
@@ -757,9 +801,11 @@ def preprocess(i):
         hpcx_paths.append("/opt/hpcx/ucx/lib")
     if os.path.exists("/opt/hpcx/ucc/lib"):
         hpcx_paths.append("/opt/hpcx/ucc/lib")
+    if os.path.exists("/opt/hpcx/ompi/lib"):
+        hpcx_paths.append("/opt/hpcx/ompi/lib")

     env['+LD_LIBRARY_PATH'] = hpcx_paths + env['+LD_LIBRARY_PATH']
-
+    env['+PYTHONPATH'] = []
     # print(env)

     return {'return': 0}
diff --git a/script/app-mlperf-inference-nvidia/meta.yaml b/script/app-mlperf-inference-nvidia/meta.yaml
index 6dda8d4a5..479020929 100644
--- a/script/app-mlperf-inference-nvidia/meta.yaml
+++ b/script/app-mlperf-inference-nvidia/meta.yaml
@@ -355,6 +355,8 @@ variations:
     group: batchsize-format-change
   v5.0+:
     group: batchsize-format-change
+    env:
+      MLC_MLPERF_INFERENCE_POST_5_0: "yes"
   v5.0:
     base:
     - v5.0+
@@ -1279,13 +1281,17 @@ variations:
       MLC_MLPERF_NVIDIA_HARNESS_NUM_SORT_SEGMENTS: '2'
       MLC_MLPERF_NVIDIA_HARNESS_SKIP_POSTPROCESS: True

-  gpu_memory.80,pre5.0,num-gpus.2,llama2-70b,offline,run_harness:
+  gpu_memory.80,pre5.0,num-gpus.2,llama2-70b_,offline,run_harness:
     default_variations:
       batch-size: batch_size.896

-  gpu_memory.80,v5.0+,num-gpus.2,llama2-70b,offline,run_harness:
+  gpu_memory.80,v5.0+,num-gpus.2,llama2-70b_,offline,run_harness:
     default_variations:
       batch-size: batch_size."llama2-70b:1024"
+
+  gpu_memory.80,v5.0+,num-gpus.8,llama2-70b_,offline,run_harness:
+    default_variations:
+      batch-size: batch_size."llama2-70b:4096"

   gpu_memory.16,pre5.0,gptj_,offline,run_harness:
     default_variations:
diff --git a/script/app-mlperf-inference/meta.yaml b/script/app-mlperf-inference/meta.yaml
index f9ad1283f..a0752e8fd 100644
--- a/script/app-mlperf-inference/meta.yaml
+++ b/script/app-mlperf-inference/meta.yaml
@@ -28,6 +28,8 @@ default_env:
   MLC_TEST_QUERY_COUNT: '10'
   MLC_MLPERF_QUANTIZATION: off
   MLC_GET_PLATFORM_DETAILS: no
+  MLC_NVIDIA_TP_SIZE: "2"
+  MLC_NVIDIA_PP_SIZE: "1"

 env:
   MLC_MLPERF_PRINT_SUMMARY: "no"
@@ -62,8 +64,8 @@ input_mapping:
   readme: MLC_MLPERF_README
   debug: MLC_DEBUG_SCRIPT_BENCHMARK_PROGRAM
   gpu_name: MLC_NVIDIA_GPU_NAME
-  nvidia_llama2_dataset_file_path: MLC_NVIDIA_LLAMA_DATASET_FILE_PATH
   tp_size: MLC_NVIDIA_TP_SIZE
+  pp_size: MLC_NVIDIA_PP_SIZE
   use_dataset_from_host: MLC_USE_DATASET_FROM_HOST
   predeps: False
@@ -324,9 +326,21 @@ variations:
           MLC_MLPERF_NVIDIA_SKIP_GPTJ:
           - "yes"
       - tags: get,ml-model,llama2-70b,_nvidia,_fp8
+        names:
+        - llama2-model
         update_tags_from_env_with_prefix:
           _tp-size.:
           - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+        skip_if_env:
+          MLC_MLPERF_NVIDIA_SKIP_LLAMA2_70B:
+          - "yes"
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+        skip_if_env:
+          MLC_MLPERF_NVIDIA_SKIP_LLAMA2_70B:
+          - "yes"
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
         skip_if_env:
           MLC_MLPERF_NVIDIA_SKIP_LLAMA2_70B:
           - "yes"
@@ -505,9 +519,15 @@ variations:
       image_name: mlperf-inference-nvidia-v4.1-dev-llm
       deps:
       - tags: get,ml-model,llama2-70b,_nvidia,_fp8
+        names:
+        - llama2-model
         update_tags_from_env_with_prefix:
           _tp-size.:
           - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
       env:
         BUILD_TRTLLM: 1

@@ -515,9 +535,15 @@ variations:
     docker:
       deps:
      - tags: get,ml-model,llama2-70b,_nvidia,_fp8
+        names:
+        - llama2-model
         update_tags_from_env_with_prefix:
           _tp-size.:
           - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
       env:
         BUILD_TRTLLM: 1

@@ -525,9 +551,33 @@ variations:
     docker:
       deps:
      - tags: get,ml-model,llama2-70b,_nvidia,_fp8
+        names:
+        - llama2-model
+        update_tags_from_env_with_prefix:
+          _tp-size.:
+          - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
+
+  nvidia-original,r5.1-dev_default,llama2-70b_:
+    default_variations:
+      precision: float8
+    docker:
+      deps:
+      - tags: get,ml-model,llama2-70b,_nvidia,_fp8,_v5.0
+        names:
+        - llama2-model
         update_tags_from_env_with_prefix:
           _tp-size.:
           - MLC_NVIDIA_TP_SIZE
+          _pp-size.:
+          - MLC_NVIDIA_PP_SIZE
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons,_nvidia
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons,_nvidia
+      env:
+        BUILD_TRTLLM: 1

   nvidia-original:
     docker:
@@ -594,6 +644,8 @@ variations:
       update_tags_from_env_with_prefix:
         "_gpu_memory." :
         - MLC_NVIDIA_GPU_MEMORY
+        "_num-gpus.":
+        - MLC_CUDA_NUM_DEVICES
       update_tags_from_env:
       - MLC_NVIDIA_HARNESS_GPU_VARIATION

@@ -1293,6 +1345,16 @@ variations:
           MLC_USE_MODEL_FROM_HOST:
           - 'yes'
         tags: get,ml-model,llama2
+        names:
+        - llama2-model
+      - tags: get,dataset,preprocessed,openorca,_calibration,_mlcommons
+        enable_if_any_env:
+          MLC_USE_DATASET_FROM_HOST:
+          - 'yes'
+      - tags: get,dataset,preprocessed,openorca,_validation,_mlcommons
+        enable_if_any_env:
+          MLC_USE_DATASET_FROM_HOST:
+          - 'yes'

   llama2-70b_,amd:
     docker:
@@ -1306,6 +1368,8 @@ variations:
           MLC_USE_MODEL_FROM_HOST:
           - 'yes'
         tags: get,ml-model,llama2,_amd,_pytorch
+        names:
+        - llama2-model

   mixtral-8x7b:
     group:
@@ -1830,6 +1894,12 @@ variations:
   fp32:
     alias: float32

+  fp4:
+    alias: float4
+
+  fp8:
+    alias: float8
+
   float32:
     group: precision
     default: true
@@ -1842,6 +1912,16 @@ variations:
       kilt-harness:
         tags: _fp32

+  float4:
+    group: precision
+    env:
+      MLC_MLPERF_MODEL_PRECISION: float4
+
+  float8:
+    group: precision
+    env:
+      MLC_MLPERF_MODEL_PRECISION: float8
+
   float16:
     group: precision
     env:
@@ -2128,10 +2208,10 @@ variations:
     - reproducibility
     add_deps_recursive:
       nvidia-inference-common-code:
-        tags: _custom,_v5.1-dev
+        tags: _mlcommons,_v5.1-dev
       nvidia-inference-server:
         version: r5.0
-        tags: _custom
+        tags: _mlcommons
       nvidia-harness:
         tags: _v5.0
       intel-harness:
@@ -2285,6 +2365,9 @@ docker:
   - "${{ GPTJ_CHECKPOINT_PATH }}:${{ GPTJ_CHECKPOINT_PATH }}"
   - "${{ MLC_CRITEO_PREPROCESSED_PATH }}:${{ MLC_CRITEO_PREPROCESSED_PATH }}"
   - "${{ LLAMA2_CHECKPOINT_PATH }}:${{ LLAMA2_CHECKPOINT_PATH }}"
+  - "${{ LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH }}:${{ LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH }}"
+  - "${{ MLC_DATASET_OPENORCA_PREPROCESSED_PATH }}:${{ MLC_DATASET_OPENORCA_PREPROCESSED_PATH }}"
+  - "${{ MLC_DATASET_OPENORCA_CALIBRATION_PATH }}:${{ MLC_DATASET_OPENORCA_CALIBRATION_PATH }}"
   - "${{ MLC_NVIDIA_LLAMA_DATASET_FILE_PATH }}:${{ MLC_NVIDIA_LLAMA_DATASET_FILE_PATH }}"
   - "${{ SDXL_CHECKPOINT_PATH }}:${{ SDXL_CHECKPOINT_PATH }}"
   - "${{ MLC_DATASET_KITS19_PREPROCESSED_PATH }}:${{ MLC_DATASET_KITS19_PREPROCESSED_PATH }}"
@@ -2314,3 +2397,4 @@ docker:
     intel_gptj_int8_model_path: MLC_MLPERF_INFERENCE_INTEL_GPTJ_INT8_MODEL_PATH
     nvidia_llama2_dataset_file_path: MLC_NVIDIA_LLAMA_DATASET_FILE_PATH
     tp_size: MLC_NVIDIA_TP_SIZE
+    pp_size: MLC_NVIDIA_PP_SIZE
diff --git a/script/build-mlperf-inference-server-nvidia/customize.py b/script/build-mlperf-inference-server-nvidia/customize.py
index 5fa70aa45..f6a2f399f 100644
--- a/script/build-mlperf-inference-server-nvidia/customize.py
+++ b/script/build-mlperf-inference-server-nvidia/customize.py
@@ -1,6 +1,7 @@
 from mlc import utils
 import os
 import shutil
+from utils import *


 def preprocess(i):
@@ -18,6 +19,15 @@ def preprocess(i):
         env['+LIBRARY_PATH'].append(os.path.join(
             env['MLC_TENSORRT_INSTALL_PATH'], "lib"))

+    hpcx_paths = []
+    if is_true(env.get('BUILD_TRTLLM')):
+        if os.path.exists("/opt/hpcx/ucx/lib"):
+            hpcx_paths.append("/opt/hpcx/ucx/lib")
+        if os.path.exists("/opt/hpcx/ucc/lib"):
+            hpcx_paths.append("/opt/hpcx/ucc/lib")
+        if os.path.exists("/opt/hpcx/ompi/lib"):
+            hpcx_paths.append("/opt/hpcx/ompi/lib")
+
     cxxflags = [
         "-Wno-error=switch",
         "-DDALI_1_15=1",
@@ -38,6 +48,8 @@ def preprocess(i):
         env['+ CXXFLAGS'] = []

     env['+ CXXFLAGS'] += cxxflags
+    env['+LD_LIBRARY_PATH'] = hpcx_paths + env['+LD_LIBRARY_PATH']
+    env['+PYTHONPATH'] = []

     return {'return': 0}
diff --git a/script/build-mlperf-inference-server-nvidia/meta.yaml b/script/build-mlperf-inference-server-nvidia/meta.yaml
index 8c57fd809..8b1a2a0e5 100644
--- a/script/build-mlperf-inference-server-nvidia/meta.yaml
+++ b/script/build-mlperf-inference-server-nvidia/meta.yaml
@@ -78,7 +78,7 @@ deps:

   # Detect CMake
   - tags: get,cmake
-    version: "3.25.1"
+    version: "3.26.4"

   # Detect Google Logger
   - tags: get,generic,sys-util,_glog-dev
@@ -101,6 +101,16 @@ deps:
   # Detect rapidjson-dev
   - tags: get,generic,sys-util,_rapidjson-dev

+  - tags: get,generic,sys-util,_ccache
+    enable_if_env:
+      BUILD_TRTLLM:
+      - 'yes'
+
+  - tags: get,generic,sys-util,_git-lfs
+    enable_if_env:
+      BUILD_TRTLLM:
+      - 'yes'
+
   # Download Nvidia Submission Code
   - tags: get,nvidia,mlperf,inference,common-code
diff --git a/script/get-generic-sys-util/meta.yaml b/script/get-generic-sys-util/meta.yaml
index cbb1bd9fc..77c48ef98 100644
--- a/script/get-generic-sys-util/meta.yaml
+++ b/script/get-generic-sys-util/meta.yaml
@@ -314,6 +314,20 @@ variations:
         brew: git-lfs
         dnf: git-lfs
         yum: git-lfs
+  ccache:
+    env:
+      MLC_SYS_UTIL_NAME: ccache
+      MLC_SYS_UTIL_VERSION_CMD: ccache --version
+      MLC_SYS_UTIL_VERSION_RE: "ccache version ([0-9.]+)"
+      MLC_TMP_VERSION_DETECT_GROUP_NUMBER: 0
+    new_env_keys:
+    - MLC_CCACHE_VERSION
+    state:
+      ccache:
+        apt: ccache
+        brew: ccache
+        dnf: ccache
+        yum: ccache
   glog-dev:
     env:
       MLC_SYS_UTIL_NAME: glog-dev
diff --git a/script/get-git-repo/run.sh b/script/get-git-repo/run.sh
index 0e0c19324..2d914ae84 100644
--- a/script/get-git-repo/run.sh
+++ b/script/get-git-repo/run.sh
@@ -64,7 +64,7 @@ if [ ! -z ${MLC_GIT_PR_TO_APPLY} ]; then
   git fetch origin ${MLC_GIT_PR_TO_APPLY}:tmp-apply
 fi

-IFS=',' read -r -a cherrypicks <<< "${MLC_GIT_CHERRYPICKS}"
+IFS=';' read -r -a cherrypicks <<< "${MLC_GIT_CHERRYPICKS}"
 for cherrypick in "${cherrypicks[@]}"
 do
   echo ""
@@ -73,18 +73,18 @@ do
   test $? -eq 0 || exit $?
done -IFS=',' read -r -a submodules <<< "${MLC_GIT_SUBMODULES}" +IFS=';' read -r -a submodules <<< "${MLC_GIT_SUBMODULES}" for submodule in "${submodules[@]}" do echo "" echo "Initializing submodule ${submodule}" - git submodule update --init "${submodule}" + git submodule update --init --recursive --checkout --force "${submodule}" test $? -eq 0 || exit $? done if [ ${MLC_GIT_PATCH} == "yes" ]; then - IFS=', ' read -r -a patch_files <<< ${MLC_GIT_PATCH_FILEPATHS} + IFS=';' read -r -a patch_files <<< ${MLC_GIT_PATCH_FILEPATHS} for patch_file in "${patch_files[@]}" do echo "" diff --git a/script/get-ml-model-llama2/customize.py b/script/get-ml-model-llama2/customize.py index 491e76248..3a5e43232 100644 --- a/script/get-ml-model-llama2/customize.py +++ b/script/get-ml-model-llama2/customize.py @@ -1,4 +1,5 @@ from mlc import utils +from utils import is_true import os @@ -8,13 +9,17 @@ def preprocess(i): env = i['env'] if env.get('MLC_TMP_ML_MODEL_PROVIDER', '') == 'nvidia': - i['run_script_input']['script_name'] = 'run-nvidia' - gpu_arch = int( - float( - env['MLC_CUDA_DEVICE_PROP_GPU_COMPUTE_CAPABILITY']) * - 10) - env['MLC_GPU_ARCH'] = gpu_arch - env['MLC_TMP_REQUIRE_DOWNLOAD'] = 'no' + if is_true(env.get('MLC_ML_MODEL_QUANTIZE_LOCALLY')): + i['run_script_input']['script_name'] = 'run-nvidia' + gpu_arch = int( + float( + env['MLC_CUDA_DEVICE_PROP_GPU_COMPUTE_CAPABILITY']) * + 10) + env['MLC_GPU_ARCH'] = gpu_arch + env['MLC_TMP_REQUIRE_DOWNLOAD'] = 'no' + else: + run_cmd = f"ln -sf {env['LLAMA2_CHECKPOINT_PATH']} {env['MLC_NVIDIA_MLPERF_SCRATCH_PATH']}/models/Llama2/fp8-quantized-ammo/llama-2-70b-chat-hf-tp{env['MLC_NVIDIA_TP_SIZE']}pp{env['MLC_NVIDIA_PP_SIZE']}-{env['MLC_ML_MODEL_PRECISION']}" + env['MLC_RUN_CMD'] = run_cmd else: path = env.get('LLAMA2_CHECKPOINT_PATH', '').strip() @@ -37,11 +42,14 @@ def postprocess(i): env = i['env'] if env.get('MLC_DOWNLOAD_MODE', '') != "dry": - if env.get('LLAMA2_CHECKPOINT_PATH', '') == '': - env['LLAMA2_CHECKPOINT_PATH'] = env['MLC_ML_MODEL_PATH'] + if is_true(env.get('MLC_ML_MODEL_QUANTIZE_LOCALLY')): + if env.get('LLAMA2_CHECKPOINT_PATH', '') == '': + env['LLAMA2_CHECKPOINT_PATH'] = env['MLC_ML_MODEL_PATH'] + else: + env['MLC_ML_MODEL_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] + env['MLC_ML_MODEL_LLAMA2_FILE_WITH_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] + env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_ML_MODEL_PATH'] else: - env['MLC_ML_MODEL_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] - env['MLC_ML_MODEL_LLAMA2_FILE_WITH_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] - env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_ML_MODEL_PATH'] + env['LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH'] = env['LLAMA2_CHECKPOINT_PATH'] return {'return': 0} diff --git a/script/get-ml-model-llama2/meta.yaml b/script/get-ml-model-llama2/meta.yaml index a2b0c3c1e..45895030f 100644 --- a/script/get-ml-model-llama2/meta.yaml +++ b/script/get-ml-model-llama2/meta.yaml @@ -15,6 +15,7 @@ new_env_keys: - LLAMA2_CHECKPOINT_PATH - MLC_NVIDIA_TP_SIZE - MLC_LLAMA2_FINAL_SAFE_TENSORS_PATH +- LLAMA2_PRE_QUANTIZED_CHECKPOINT_PATH prehook_deps: - enable_if_env: MLC_TMP_REQUIRE_DOWNLOAD: @@ -48,10 +49,15 @@ tests: - r2-downloader,70b,mlc,dry-run - r2-downloader,7b,mlc,dry-run variations: - L40s: + pre-quantized: + group: quantization env: - MLC_NVIDIA_TP_SIZE: 4 - group: gpu + MLC_ML_MODEL_PRE_QUANTIZED: 'yes' + quantize-locally: + default: true + group: quantization + env: + MLC_ML_MODEL_QUANTIZE_LOCALLY: 'yes' amd: default_env: MLC_LLAMA2_QUANTIZATION_DEVICE: '' @@ -80,10 +86,6 @@ variations: 
       MLC_ML_MODEL_PRECISION: fp8
       MLC_ML_MODEL_WEIGHT_DATA_TYPES: fp8
     group: precision
-  generic:
-    env:
-      MLC_NVIDIA_TP_SIZE: 2
-    group: gpu
   int8:
     env:
       MLC_ML_MODEL_INPUT_DATA_TYPES: int8
@@ -142,18 +144,21 @@ variations:
     update_tags_from_env_with_prefix:
       _url.:
       - MLC_DOWNLOAD_URL
-  mlc,rclone,70b:
+  mlc,rclone,70b,quantize-locally:
     env:
       MLC_DOWNLOAD_URL: mlc-llama2:Llama-2-70b-chat-hf
-  mlc,rclone,7b:
+  mlc,rclone,7b,quantize-locally:
     env:
       MLC_DOWNLOAD_URL: mlc-llama2:Llama-2-7b-chat-hf
-  mlc,r2-downloader,70b:
+  mlc,r2-downloader,70b,quantize-locally:
     env:
       MLC_DOWNLOAD_URL: https://llama2.mlcommons-storage.org/metadata/llama-2-70b-chat-hf.uri
-  mlc,r2-downloader,7b:
+  mlc,r2-downloader,7b,quantize-locally:
     env:
       MLC_DOWNLOAD_URL: https://llama2.mlcommons-storage.org/metadata/llama-2-7b-chat-hf.uri
+  mlc,r2-downloader,70b,pre-quantized,fp8:
+    env:
+      MLC_DOWNLOAD_URL: https://llama2.mlcommons-storage.org/metadata/llama-2-70b-chat-hf-tp<>pp<>-<<>>.uri
   hf:
     group: download-source
     env:
@@ -203,6 +208,10 @@ variations:
       framework: pytorch
     env:
       MLC_TMP_ML_MODEL_PROVIDER: nvidia
+    deps:
+    - tags: get,nvidia,scratch,space
+      names:
+      - mlperf-inference-nvidia-scratch-space
     group: model-provider
   pytorch:
     default: true
@@ -233,19 +242,21 @@ variations:
     - tags: get,generic-python-lib,_package.compressed_tensors
   pytorch,fp32:
     env: {}
-  pytorch,nvidia:
+  pytorch,nvidia,v5.0:
+    deps:
+    - env:
+        MLC_GIT_CHECKOUT_PATH_ENV_NAME: MLC_TENSORRT_LLM_CHECKOUT_PATH
+      extra_cache_tags: tensorrt-llm
+      tags: get,git,repo,_repo.https://github.com/NVIDIA/TensorRT-LLM.git,_sha.2ea17cdad28bed0f30e80eea5b1380726a7c6493,_submodules.3rdparty/NVTX;3rdparty/cutlass;3rdparty/cxxopts;3rdparty/json;3rdparty/pybind11;3rdparty/ucxx;3rdparty/xgrammar
+  pytorch,nvidia,quantize-locally:
     default_variations:
-      gpu: generic
       precision: fp8
+      tp-size: tp-size.2
+      pp-size: pp-size.1
     deps:
-    - env:
-        MLC_GIT_CHECKOUT_PATH_ENV_NAME: MLC_TENSORRT_LLM_CHECKOUT_PATH
-      extra_cache_tags: tensorrt-llm
-      tags: get,git,repo,_repo.https://github.com/NVIDIA/TensorRT-LLM.git,_sha.0ab9d17a59c284d2de36889832fe9fc7c8697604
    - names:
      - cuda
      tags: get,cuda
-    - tags: get,nvidia,scratch,space
    - tags: get,cuda-devices,_with-pycuda
    - env: {}
      force_new_env_keys:
@@ -254,6 +265,7 @@ variations:
    - names:
      - nvidia-inference-common-code
      tags: get,nvidia,inference,common-code
+    - tags: get,preprocessed,dataset,openorca,_calibration,_mlc,_nvidia
    - names:
      - python
      - python3
@@ -268,10 +280,17 @@ variations:
   tp-size.#:
     env:
       MLC_NVIDIA_TP_SIZE: '#'
-    group: gpu
+    group: tp-size
+  pp-size.#:
+    env:
+      MLC_NVIDIA_PP_SIZE: '#'
+    group: pp-size
   uint8:
     env:
       MLC_ML_MODEL_INPUT_DATA_TYPES: uint8
       MLC_ML_MODEL_PRECISION: uint8
       MLC_ML_MODEL_WEIGHT_DATA_TYPES: uint8
     group: precision
+  v5.0:
+    group: version
+    default: true
diff --git a/script/get-ml-model-llama2/run-nvidia.sh b/script/get-ml-model-llama2/run-nvidia.sh
index d38e911cb..ffabbe4c6 100644
--- a/script/get-ml-model-llama2/run-nvidia.sh
+++ b/script/get-ml-model-llama2/run-nvidia.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 echo "Set tp size is ${MLC_NVIDIA_TP_SIZE}"
+echo "Set pp size is ${MLC_NVIDIA_PP_SIZE}"

 if [[ ! -e ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}/models/Llama2/Llama-2-70b-chat-hf ]]; then
     mkdir -p ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}/models/Llama2/Llama-2-70b-chat-hf
     cd ${LLAMA2_CHECKPOINT_PATH}
     cp -r ${LLAMA2_CHECKPOINT_PATH}/* ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}/models/Llama2/Llama-2-70b-chat-hf
-    test $? -eq 0 || exit $?
 fi

 echo "cd ${MLC_TENSORRT_LLM_CHECKOUT_PATH}"
 cd ${MLC_TENSORRT_LLM_CHECKOUT_PATH}
@@ -16,11 +16,14 @@
 make -C docker build
 test $? -eq 0 || exit $?
if [ "${MLC_NVIDIA_TP_SIZE}" -eq 1 ]; then - RUN_CMD="bash -c 'python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp1-fp8-02072024 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE}'" + RUN_CMD="bash -c 'git lfs install && git lfs pull && python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --use_ccache --benchmarks --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp${MLC_NVIDIA_PP_SIZE}-fp8-02072024 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE} --pp_size ${MLC_NVIDIA_PP_SIZE} --calib_dataset=/calib_dataset'" + echo "$RUN_CMD" else - RUN_CMD="bash -c 'python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp1-fp8 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE}'" + RUN_CMD="bash -c 'git lfs install && git lfs pull && python3 scripts/build_wheel.py -a=${MLC_GPU_ARCH} --clean --install --use_ccache --benchmarks --trt_root /usr/local/tensorrt/ && python examples/quantization/quantize.py --dtype=float16 --output_dir=/mnt/models/Llama2/fp8-quantized-ammo/llama2-70b-chat-hf-tp${MLC_NVIDIA_TP_SIZE}pp${MLC_NVIDIA_PP_SIZE}-fp8 --model_dir=/mnt/models/Llama2/Llama-2-70b-chat-hf --qformat=fp8 --kv_cache_dtype=fp8 --tp_size ${MLC_NVIDIA_TP_SIZE} --pp_size ${MLC_NVIDIA_PP_SIZE} --calib_dataset=/calib_dataset'" + echo "$RUN_CMD" fi -DOCKER_RUN_ARGS=" -v ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}:/mnt" +# TODO: check whether --device nvidia.com/gpu=all would work for docker +DOCKER_RUN_ARGS=" -v ${MLC_NVIDIA_MLPERF_SCRATCH_PATH}:/mnt -v ${MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH}:/calib_dataset -u $(id -u):$(id -g) --userns=keep-id --device nvidia.com/gpu=all -e NVIDIA_VISIBLE_DEVICES=all" export DOCKER_RUN_ARGS="$DOCKER_RUN_ARGS" export RUN_CMD="$RUN_CMD" make -C docker run LOCAL_USER=1 diff --git a/script/get-ml-model-llama2/run.sh b/script/get-ml-model-llama2/run.sh new file mode 100644 index 000000000..5c87d5618 --- /dev/null +++ b/script/get-ml-model-llama2/run.sh @@ -0,0 +1,3 @@ +echo "${MLC_RUN_CMD}" +eval ${MLC_RUN_CMD} +test $? -eq 0 || exit $? 
\ No newline at end of file
diff --git a/script/get-preprocessed-dataset-openorca/customize.py b/script/get-preprocessed-dataset-openorca/customize.py
index daeef32f4..a61128618 100644
--- a/script/get-preprocessed-dataset-openorca/customize.py
+++ b/script/get-preprocessed-dataset-openorca/customize.py
@@ -9,21 +9,16 @@ def preprocess(i):
     env = i['env']

     if is_true(str(env.get('MLC_DATASET_PREPROCESSED_BY_MLC', ''))):
-        run_dir = os.getcwd()
-        if is_true(env.get('MLC_DATASET_CALIBRATION', '')):
-            env['MLC_DATASET_CALIBRATION_PATH'] = os.path.join(
-                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
-                "open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz")
-            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
-            env['MLC_DATASET_OPENORCA_CALIBRATION_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
-        else:
-            env['MLC_DATASET_PREPROCESSED_PATH'] = os.path.join(
-                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
-                "open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz")
-            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
-            env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
-        # run_cmd = f"gunzip -k {env['MLC_DATASET_PREPROCESSED_PATH']}"
+        run_dir = env['MLC_OPENORCA_PREPROCESSED_ROOT']
         run_cmd = ''
+        env['MLC_DATASET_CALIBRATION_PATH'] = os.path.join(
+            env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+            "open_orca_gpt4_tokenized_llama.calibration_1000.pkl.gz")
+        run_cmd = f"gzip -dkf {env['MLC_DATASET_CALIBRATION_PATH']}"
+        env['MLC_DATASET_PREPROCESSED_PATH'] = os.path.join(
+            env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+            "open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz")
+        run_cmd += f" && gzip -dkf {env['MLC_DATASET_PREPROCESSED_PATH']}"
     else:
         inference_src = env['MLC_MLPERF_INFERENCE_SOURCE']
         run_dir = os.path.join(inference_src, 'language', 'llama2-70b')
@@ -52,4 +47,26 @@ def postprocess(i):
 def postprocess(i):

     env = i['env']
+    if is_true(str(env.get('MLC_DATASET_PREPROCESSED_BY_MLC', ''))):
+        env['PREPROCESSED_DATA_DIR'] = os.path.dirname(
+            env['MLC_OPENORCA_PREPROCESSED_ROOT'])
+        if is_true(env.get('MLC_DATASET_CALIBRATION', '')):
+            env['MLC_DATASET_CALIBRATION_PATH'] = os.path.join(
+                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+                "open_orca_gpt4_tokenized_llama.calibration_1000.pkl")
+            if env.get('MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER',
+                       '') == "nvidia":
+                env['MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH'] = os.path.join(
+                    env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+                    "preprocessed_data",
+                    "mlperf_llama2_openorca_calibration_1k")
+            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
+            env['MLC_DATASET_OPENORCA_CALIBRATION_PATH'] = env['MLC_DATASET_CALIBRATION_PATH']
+        else:
+            env['MLC_DATASET_PREPROCESSED_PATH'] = os.path.join(
+                env['MLC_OPENORCA_PREPROCESSED_ROOT'],
+                "open_orca_gpt4_tokenized_llama.sampled_24576.pkl")
+            env['MLC_GET_DEPENDENT_CACHED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
+            env['MLC_DATASET_OPENORCA_PREPROCESSED_PATH'] = env['MLC_DATASET_PREPROCESSED_PATH']
+
     return {'return': 0}
diff --git a/script/get-preprocessed-dataset-openorca/meta.yaml b/script/get-preprocessed-dataset-openorca/meta.yaml
index 5e9f1a83d..dbcf7e46a 100644
--- a/script/get-preprocessed-dataset-openorca/meta.yaml
+++ b/script/get-preprocessed-dataset-openorca/meta.yaml
@@ -8,6 +8,10 @@ default_env:
   MLC_DATASET_CALIBRATION: 'no'
 deps:
 - tags: get,sys-utils-mlc
+  skip_if_env:
+    MLC_DATASET_PREPROCESSED_BY_MLC:
+    - 'on'
+    - 'yes'
 - names:
   - python
   - python3
@@ -38,6 +42,30 @@ deps:
 - names:
   - transformers
   tags: get,generic-python-lib,_package.transformers
+- names:
+  - datasets
+  tags: get,generic-python-lib,_package.datasets
+  version_max: 2.19.2
+  enable_if_env:
+    MLC_DATASET_PREPROCESSED_BY_MLC:
+    - 'on'
+    - 'yes'
+- names:
+  - numpy
+  tags: get,generic-python-lib,_package.numpy
+  version_max: 1.26.4
+  enable_if_env:
+    MLC_DATASET_PREPROCESSED_BY_MLC:
+    - 'on'
+    - 'yes'
+- names:
+  - pandas
+  tags: get,generic-python-lib,_package.pandas
+  version_max: 2.2.2
+  enable_if_env:
+    MLC_DATASET_PREPROCESSED_BY_MLC:
+    - 'on'
+    - 'yes'
 - skip_if_env:
     MLC_DATASET_PREPROCESSED_BY_MLC:
     - 'on'
@@ -87,7 +115,7 @@ variations:
       extra_cache_tags: openorca,preprocessed,dataset
       force_cache: true
       names:
-      - dae
+      - dae-openorca
       tags: download-and-extract,_rclone
       update_tags_from_env_with_prefix:
         _url.:
@@ -108,3 +136,11 @@ variations:
     new_env_keys:
     - MLC_DATASET_PREPROCESSED_PATH
     - MLC_DATASET_OPENORCA_PREPROCESSED_PATH
+    - PREPROCESSED_DATA_DIR
+  nvidia:
+    group: preprocess-step-provider
+    env:
+      MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER: nvidia
+  nvidia,calibration:
+    new_env_keys:
+    - MLC_NVIDIA_PREPROCESSED_CALIBRATION_DATASET_PATH
diff --git a/script/get-preprocessed-dataset-openorca/nvidia_preprocess.py b/script/get-preprocessed-dataset-openorca/nvidia_preprocess.py
new file mode 100644
index 000000000..013aa60e7
--- /dev/null
+++ b/script/get-preprocessed-dataset-openorca/nvidia_preprocess.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Script to preprocess the data for Llama2-70b.""" + +import argparse +import logging +from pathlib import Path + +import numpy as np +import pandas as pd +from datasets import Dataset + +G_MAX_INPUT_TOK_LEN = 2048 +G_LLAMA2_EOS = 2 + + +def preprocess_data(data_dir, preprocessed_data_dir): + data_dir = Path(data_dir) + preprocessed_data_dir = Path(preprocessed_data_dir) + preprocessed_data_dir.mkdir(parents=True, exist_ok=True) + + # Load inference data + inference_pkl_path = data_dir / "open_orca_gpt4_tokenized_llama.sampled_24576.pkl" + df = pd.read_pickle(inference_pkl_path) + toks = df['tok_input'].to_list() + toks_np = np.ones((len(toks), G_MAX_INPUT_TOK_LEN), + dtype=np.int32) * G_LLAMA2_EOS + tok_len_np = df['tok_input_length'].to_numpy().astype(np.int32) + + for i, q in enumerate(toks): + toks_np[i, :len(q)] = q + assert len(q) == tok_len_np[i] + + np.save(preprocessed_data_dir / "input_ids_padded.npy", toks_np) + np.save(preprocessed_data_dir / "input_lens.npy", tok_len_np) + + # Load calibration data + calib_pkl_path = data_dir / "open_orca_gpt4_tokenized_llama.calibration_1000.pkl" + calib_df = pd.read_pickle(calib_pkl_path) + + if 'input' not in calib_df.columns: + raise ValueError("The DataFrame does not contain an 'input' column.") + + hf_dataset = Dataset.from_pandas(calib_df[['input']]) + hf_dataset = hf_dataset.rename_column("input", "text") + + dataset_dir = preprocessed_data_dir / 'mlperf_llama2_openorca_calibration_1k' + dataset_dir.mkdir(parents=True, exist_ok=True) + hf_dataset.to_parquet(dataset_dir / "data.parquet") + + logging.info(f"Done preprocessing llama2 at {preprocessed_data_dir}") + + +def main(): + parser = argparse.ArgumentParser( + description="Preprocess Llama2 data for TensorRT") + parser.add_argument( + "--data_dir", "-d", + help="Path to the input open_orca pickle file", + default="build/data" + ) + parser.add_argument( + "--preprocessed_data_dir", "-o", + help="Output directory for the preprocessed data.", + default="build/preprocessed_data" + ) + args = parser.parse_args() + + preprocess_data(args.data_dir, args.preprocessed_data_dir) + + +if __name__ == "__main__": + main() diff --git a/script/get-preprocessed-dataset-openorca/run.sh b/script/get-preprocessed-dataset-openorca/run.sh index aa7be3116..b637ad585 100644 --- a/script/get-preprocessed-dataset-openorca/run.sh +++ b/script/get-preprocessed-dataset-openorca/run.sh @@ -3,3 +3,16 @@ cd ${MLC_RUN_DIR} echo "${MLC_RUN_CMD}" eval "${MLC_RUN_CMD}" +test $? -eq 0 || exit $? + +if { [ "${MLC_DATASET_PREPROCESSED_BY_MLC}" = "true" ] || \ + [ "${MLC_DATASET_PREPROCESSED_BY_MLC}" = "yes" ] || \ + [ "${MLC_DATASET_PREPROCESSED_BY_MLC}" = "1" ]; } && \ + [ "${MLC_TMP_DATASET_PREPROCESS_STEP_PROVIDER}" = "nvidia" ]; then + + ${MLC_PYTHON_BIN_WITH_PATH} ${MLC_TMP_CURRENT_SCRIPT_PATH}/nvidia_preprocess.py \ + -d ${MLC_OPENORCA_PREPROCESSED_ROOT} \ + -o ${MLC_OPENORCA_PREPROCESSED_ROOT}/preprocessed_data + test $? -eq 0 || exit $? 
+fi
+
diff --git a/script/run-docker-container/customize.py b/script/run-docker-container/customize.py
index 3c40bac93..aa27dcfb6 100644
--- a/script/run-docker-container/customize.py
+++ b/script/run-docker-container/customize.py
@@ -179,7 +179,10 @@ def postprocess(i):
     if env.get('MLC_DOCKER_ADD_NUM_GPUS', '') != '':
         run_opts += " --gpus={}".format(env['MLC_DOCKER_ADD_NUM_GPUS'])
     elif env.get('MLC_DOCKER_ADD_ALL_GPUS', '') != '':
-        run_opts += " --gpus=all"
+        if env.get('MLC_CONTAINER_TOOL') == "podman":
+            run_opts += " --device nvidia.com/gpu=all"
+        else:
+            run_opts += " --gpus=all"

     if env.get('MLC_DOCKER_SHM_SIZE', '') != '':
         run_opts += " --shm-size={}".format(env['MLC_DOCKER_SHM_SIZE'])