Squashed commit of the following:

commit e5010caebc5a135e40464a06432a5cf1fc965203 Author: Ritwik Das <ritdas@microsoft.com> Date: Mon Jun 27 23:32:49 2022 +0000 Merged PR 2721: Remove unnecessary logging in benchmarks Remove unnecessary logging in benchmarks commit e0c5945d3ef218a5be858bc0934274793972abdb Author: Lisa Ong <onglisa@microsoft.com> Date: Tue Jun 21 01:12:02 2022 +0000 Merged PR 2674: Support emitting runtime array sizes in the Value DSL * Minimum set of changes to support runtime sizes in the Value DSL without transformations * Add a ScalarDimension type (name TBC) which is aliased to Scalar * Support variable ends in MemoryLayout, ScheduledLoopOp, RangeValueAnalysis * Use mlir::ShapedType::kDynamicSize and mlir::ShapedType::kDynamicStrideOrOffset as sentinel values, following the pattern in MemRefOps, TensorOps, etc. * TODO: E2E verification in the next PR * TODO: Python DSL changes in the next PR Output of mlir-translate for the runtime_sizes_all case, where %21, %22 and %23 are the runtime sizes for M, N, and K: ``` define void @NestMatMul(float* %0, float* %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, float* %7, float* %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13, float* %14, float* %15, i64 %16, i64 %17, i64 %18, i64 %19, i64 %20, i64 %21, i64 %22, i64 %23) !dbg !3 { br label %25, !dbg !7 25: ; preds = %57, %24 %26 = phi i64 [ %58, %57 ], [ 0, %24 ] %27 = icmp slt i64 %26, %21, !dbg !9 br i1 %27, label %28, label %59, !dbg !10 28: ; preds = %25 br label %29, !dbg !11 29: ; preds = %55, %28 %30 = phi i64 [ %56, %55 ], [ 0, %28 ] %31 = icmp slt i64 %30, %22, !dbg !12 br i1 %31, label %32, label %57, !dbg !13 32: ; preds = %29 br label %33, !dbg !14 33: ; preds = %36, %32 %34 = phi i64 [ %54, %36 ], [ 0, %32 ] %35 = icmp slt i64 %34, %23, !dbg !15 br i1 %35, label %36, label %55, !dbg !16 36: ; preds = %33 %37 = mul i64 %26, %5, !dbg !17 %38 = add i64 %37, %34, !dbg !18 %39 = getelementptr float, float* %1, i64 %38, !dbg !19 %40 = load float, float* %39, align 4, !dbg !20 %41 = mul 
i64 %34, %12, !dbg !21 %42 = add i64 %41, %30, !dbg !22 %43 = getelementptr float, float* %8, i64 %42, !dbg !23 %44 = load float, float* %43, align 4, !dbg !24 %45 = fmul float %40, %44, !dbg !25 %46 = mul i64 %26, %19, !dbg !26 %47 = add i64 %46, %30, !dbg !27 %48 = getelementptr float, float* %15, i64 %47, !dbg !28 %49 = load float, float* %48, align 4, !dbg !29 %50 = fadd float %49, %45, !dbg !30 %51 = mul i64 %26, %19, !dbg !31 %52 = add i64 %51, %30, !dbg !32 %53 = getelementptr float, float* %15, i64 %52, !dbg !33 store float %50, float* %53, align 4, !dbg !34 %54 = add i64 %34, 1, !dbg !35 br label %33, !dbg !36 55: ; preds = %33 %56 = add i64 %30, 1, !dbg !37 br label %29, !dbg !38 57: ; preds = %29 %58 = add i64 %26, 1, !dbg !39 br label %25, !dbg !40 59: ; preds = %25 ret void, !dbg !41 } ``` Related work items: #3716, #3717 commit 51a07e5c60009c47c3b375b402ac96f47619ca8f Author: Ritwik Das <ritdas@microsoft.com> Date: Tue Jun 21 00:18:02 2022 +0000 Merged PR 2682: Add nvidia device optimized sizes and some benchmark fixes Add nvidia dev opt sizes and some bench fixes commit 6325b5e5bc68136d29e4a65d657699a4e781214d Author: Ritwik Das <ritdas@microsoft.com> Date: Sat Jun 18 17:59:50 2022 +0000 Merged PR 2676: Add automated weekly rocm baseline benchmark https://intelligentdevices.visualstudio.com/ELL/_build/results?buildId=41316&view=logs&j=4f7f213a-5f0f-58b0-1189-99ef12faf0d8&t=687344d2-d6b6-5d8c-dd9d-6aab558fd96c https://intelligentdevices.visualstudio.com/ELL/_build/results?buildId=41314&view=logs&j=4f7f213a-5f0f-58b0-1189-99ef12faf0d8 commit 940e599ff7026e7c41cb1b2566eec44d70709e96 Author: Ritwik Das <ritdas@microsoft.com> Date: Fri Jun 17 16:34:22 2022 +0000 Merged PR 2673: Add automated weekly baseline benchmarks on Nvidia GPU
microsoft · Jun 30, 2022 · e5d75a6 · e5d75a6
1 parent 2459eb8
commit e5d75a6
Show file tree

Hide file tree

Showing 66 changed files with 1,471 additions and 812 deletions.
diff --git a/.azure/cuda/cuda-benchmark-baseline.yml b/.azure/cuda/cuda-benchmark-baseline.yml
@@ -0,0 +1,93 @@
+schedules:
+  # Weekly scheduled run, restricted to the main branch.
+  # NOTE(review): Azure Pipelines interprets cron expressions in UTC, so this fires Saturday 00:00 UTC - confirm that matches the intended "Sat_5PM".
+  - cron: "0 0 * * 6"
+    displayName: Sat_5PM
+    branches:
+      include:
+        - main
+
+trigger: none  # no CI trigger is configured; the schedule above is the only trigger declared in this file
+
+jobs:
+  - job: "CUDA_Benchmarking_Baseline"
+    # 360 minutes = 6 hours allowed for the whole job.
+    timeoutInMinutes: 360
+
+    pool:
+      name: LinuxNVGPUPool
+      # Only agents whose Target.Model capability equals NVIDIA_RTX_A6000 are eligible.
+      demands:
+        - Target.Model -equals NVIDIA_RTX_A6000
+    steps:
+      - bash: |
+          # Write core dumps under the build directory and lift the core size limit.
+          # NOTE(review): ulimit only affects this step's shell - confirm later steps do not rely on it.
+          sudo sysctl -w kernel.core_pattern="$(Build.SourcesDirectory)/build/core-%e-%s-%u-%g-%p-%t.dump"
+          ulimit -c unlimited
+          python -m pip install -U pip
+          python -m pip install -r $(Build.SourcesDirectory)/requirements.txt
+          python -m pip install -r $(Build.SourcesDirectory)/tools/benchmarkers/requirements.txt
+          python -m pip install -U cmake
+          # -p and -sfn keep this repeatable on an agent that still has the cache dir or the link from an earlier run;
+          # without -n a second "ln -s" would create a nested link inside the existing ccache directory.
+          echo "mkdir -p $HOME/.ccache"
+          mkdir -p "$HOME/.ccache"
+          echo "ln -sfn $HOME/.ccache $(System.DefaultWorkingDirectory)/ccache"
+          ln -sfn "$HOME/.ccache" "$(System.DefaultWorkingDirectory)/ccache"
+          conan remote add accera $(CONAN_REMOTE)
+          # Read the password from the env mapping below instead of inlining it into the script text.
+          conan user -p "$CONAN_PWD" -r accera $(CONAN_USERNAME)
+        displayName: Install prereqs
+        env:
+          CONAN_PWD: $(CONAN_PWD)
+
+      - bash: |
+          # Fetch the git submodules, then bootstrap the vcpkg checkout under external/ and install the two packages.
+          git submodule init
+          git submodule update
+          ./external/vcpkg/bootstrap-vcpkg.sh
+          ./external/vcpkg/vcpkg install catch2 tomlplusplus
+        displayName: Update vcpkg dependencies
+        workingDirectory: "$(Build.SourcesDirectory)"
+
+      - bash: |
+          # Runs the setup.py "build" command from the repository root.
+          python ./setup.py build
+        displayName: Python build
+        workingDirectory: "$(Build.SourcesDirectory)"
+
+      - bash: |
+          # Builds only the cublas_gemm target in the existing build tree.
+          # The directory name hard-codes linux-x86_64 and 3.8, so it must match what the Python build step produced.
+          ninja -C $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8 cublas_gemm
+        displayName: Cublas build
+        workingDirectory: "$(Build.SourcesDirectory)"
+
+      - bash: |
+          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
+          # cublas_baseline <type> <input-csv>: one gpu_benchmark_tool.py run against the cublas_gemm binary.
+          cublas_baseline() {
+            python gpu_benchmark_tool.py --type "$1" --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input "$2"
+          }
+          # Same seven runs, in the same order, as the expanded command list.
+          cublas_baseline h gemm_rectangle_A6000.csv
+          cublas_baseline h gemm_square.csv
+          cublas_baseline h gemm_bert_assorted.csv
+          cublas_baseline s gemm_rectangle_A6000.csv
+          cublas_baseline s gemm_square.csv
+          cublas_baseline s gemm_bert_assorted.csv
+          cublas_baseline s gemm_resnet_inception.csv
+        displayName: Run CUBLAS benchmarks
+        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
+        env:
+          ACCOUNT_KEY: $(ACCOUNT_KEY)
+
+      - bash: |
+          # Start from a clean tree: a checkout left behind by an earlier failed run would make the clone fail.
+          rm -rf cutlass && git clone https://github.com/NVIDIA/cutlass.git
+          cd cutlass && mkdir build && cd build
+          export CUDACXX=`which nvcc`
+          cmake .. -DCUTLASS_NVCC_ARCHS=86 -DCUTLASS_LIBRARY_KERNELS=all
+          # A bare -j puts no limit on parallel jobs; cap it at the number of available cores.
+          make cutlass_profiler -j`nproc`
+        displayName: CUTLASS build
+
+      - bash: |
+          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
+          # cutlass_baseline <type> <input-csv>: one gpu_benchmark_tool.py run against the cutlass_profiler binary.
+          cutlass_baseline() {
+            python gpu_benchmark_tool.py --type "$1" --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input "$2"
+          }
+          # Same seven runs, in the same order, as the expanded command list.
+          cutlass_baseline h gemm_rectangle_A6000.csv
+          cutlass_baseline h gemm_square.csv
+          cutlass_baseline h gemm_bert_assorted.csv
+          cutlass_baseline s gemm_rectangle_A6000.csv
+          cutlass_baseline s gemm_square.csv
+          cutlass_baseline s gemm_bert_assorted.csv
+          cutlass_baseline s gemm_resnet_inception.csv
+        displayName: Run CUTLASS benchmarks
+        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
+        env:
+          ACCOUNT_KEY: $(ACCOUNT_KEY)
+
+      - bash: |
+          # Remove the CUTLASS clone created by the build step.
+          rm -rf cutlass
+        displayName: Cleanup CUTLASS build dir
+        # Run even when an earlier step failed, so the clone does not linger on the agent.
+        condition: always()
diff --git a/.azure/cuda/cuda-benchmark-fp16-bert.yml b/.azure/cuda/cuda-benchmark-fp16-bert.yml
@@ -51,7 +51,7 @@ jobs:
       # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input hgemm_bert_assorted.csv --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_bert_assorted.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp16 benchmarks BERT
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:

diff --git a/.azure/cuda/cuda-benchmark-fp16-big.yml b/.azure/cuda/cuda-benchmark-fp16-big.yml
@@ -46,12 +46,17 @@ jobs:
         displayName: Python build
         workingDirectory: "$(Build.SourcesDirectory)"
 
-      # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to
-      # a comma-separated list of device IDs
-      # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_big_fp16.csv --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_big_A6000.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+        displayName: Run fp16 benchmarks BIG A6000
+        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
+        env:
+          ACCOUNT_KEY: $(ACCOUNT_KEY)
+
+      - bash: |
+          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
+          python gpu_benchmark_tool.py --input gemm_big.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp16 benchmarks BIG
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:

diff --git a/.azure/cuda/cuda-benchmark-fp16.yml b/.azure/cuda/cuda-benchmark-fp16.yml
@@ -46,12 +46,17 @@ jobs:
         displayName: Python build
         workingDirectory: "$(Build.SourcesDirectory)"
 
-      # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to
-      # a comma-separated list of device IDs
-      # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_small_fp16.csv --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_small_A6000.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+        displayName: Run fp16 benchmarks A6000
+        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
+        env:
+          ACCOUNT_KEY: $(ACCOUNT_KEY)
+
+      - bash: |
+          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
+          python gpu_benchmark_tool.py --input gemm_small.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp16 benchmarks
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:

diff --git a/.azure/cuda/cuda-benchmark-fp32-bert.yml b/.azure/cuda/cuda-benchmark-fp32-bert.yml
@@ -51,7 +51,7 @@ jobs:
       # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input sgemm_bert_assorted.csv --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_bert_assorted.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp32 benchmarks BERT
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:

diff --git a/.azure/cuda/cuda-benchmark-fp32-big.yml b/.azure/cuda/cuda-benchmark-fp32-big.yml
@@ -46,12 +46,17 @@ jobs:
         displayName: Python build
         workingDirectory: "$(Build.SourcesDirectory)"
 
-      # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to
-      # a comma-separated list of device IDs
-      # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_big_fp32.csv --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_big_A6000.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+        displayName: Run fp32 benchmarks BIG A6000
+        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
+        env:
+          ACCOUNT_KEY: $(ACCOUNT_KEY)
+
+      - bash: |
+          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
+          python gpu_benchmark_tool.py --input gemm_big.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp32 benchmarks BIG
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:

diff --git a/.azure/cuda/cuda-benchmark-fp32-resnet.yml b/.azure/cuda/cuda-benchmark-fp32-resnet.yml
@@ -51,7 +51,7 @@ jobs:
       # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input sgemm_resnet_inception.csv --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_resnet_inception.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp32 benchmarks RESNET
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:

diff --git a/.azure/cuda/cuda-benchmark-fp32.yml b/.azure/cuda/cuda-benchmark-fp32.yml
@@ -46,14 +46,18 @@ jobs:
         displayName: Python build
         workingDirectory: "$(Build.SourcesDirectory)"
 
-      # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to
-      # a comma-separated list of device IDs
-      # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_small_fp32.csv --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
-        displayName: Run fp32 benchmarks
+          python gpu_benchmark_tool.py --input gemm_small_A6000.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+        displayName: Run fp32 benchmarks A6000
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:
           ACCOUNT_KEY: $(ACCOUNT_KEY)
 
+      - bash: |
+          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
+          python gpu_benchmark_tool.py --input gemm_small.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+        displayName: Run fp32 benchmarks
+        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
+        env:
+          ACCOUNT_KEY: $(ACCOUNT_KEY)