Squashed commit of the following:

commit f3a1a2becb6740ae8cf7873b5029c6df140f5c19
Author: Kern Handa <kerha@microsoft.com>
Date:   Tue Jul 12 16:52:41 2022 +0000

    Merged PR 2744: [doc] Fixes link in reference/functions/cast.md, revs version on all docs

    [doc] Fixes link in reference/functions/cast.md

commit 23f4c8fbf2415b02e8b0090a76380d34790205fa
Author: Lisa Ong <onglisa@microsoft.com>
Date:   Tue Jul 12 05:55:48 2022 +0000

    Merged PR 2743: [DSL] Document implicit casting rules and the explicit `cast` function

    * Document implicit casting rules implemented by !2693
    * Promote `acc.cast` to a documented function to give the user control to override implicit casting behavior

commit 3ec63b62705327a65decc4da7ec4cb5412dc7299
Author: Kern Handa <kerha@microsoft.com>
Date:   Mon Jul 11 23:57:23 2022 +0000

    Merged PR 2739: Updates ROCM tensorization pattern to handle casting

    Updates ROCM tensorization pattern to handle casting

commit 60c082dd38ff1b0bc030a7e28dc19f553bad9099
Author: Mason Remy <masonr@microsoft.com>
Date:   Mon Jul 11 22:58:42 2022 +0000

    Merged PR 2643: Some fixes for last major array caching in tensorization

    Some fixes for last major array caching in tensorization

commit 812c3065b7d4d6c9d716acf4fb1df4be66ef101d
Author: Kern Handa <kerha@microsoft.com>
Date:   Mon Jul 11 20:43:12 2022 +0000

    Merged PR 2693: Updates DSL codegen to implicitly cast if possible

    Updates DSL codegen to implicitly cast if possible

commit 6ed316e50e8f9e398f9ee6b8bfa8e6aa05fbffb1
Author: Ritwik Das <ritdas@microsoft.com>
Date:   Sat Jul 9 05:52:22 2022 +0000

    Merged PR 2735: Pass multiple input files as comma-separated list to benchmark tool

    https://intelligentdevices.visualstudio.com/ELL/_build/results?buildId=41588&view=logs&j=d78921a4-2f18-50b0-77ad-4c6803f3371b&t=f97c60f6-ada7-5ec9-5ea1-510216c408e9

    Above pipeline did not run the 2nd set of input sizes since the 1st process did not exit until pipeline timeout was hit. After the fix, we will always have a single job.
microsoft · Jul 13, 2022 · 1d3b1a3 · 1d3b1a3
1 parent e5d75a6
commit 1d3b1a3
Show file tree

Hide file tree

Showing 115 changed files with 1,131 additions and 467 deletions.
diff --git a/.azure/cuda/cuda-benchmark-baseline.yml b/.azure/cuda/cuda-benchmark-baseline.yml
@@ -54,13 +54,8 @@ jobs:
 
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv
-          python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_square.csv
-          python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_bert_assorted.csv
-          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv
-          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_square.csv
-          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_bert_assorted.csv
-          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_resnet_inception.csv
+          python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv
+          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv
         displayName: Run CUBLAS benchmarks
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:
@@ -76,13 +71,8 @@ jobs:
 
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv
-          python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_square.csv
-          python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_bert_assorted.csv
-          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv
-          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_square.csv
-          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_bert_assorted.csv
-          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_resnet_inception.csv
+          python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv
+          python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv
         displayName: Run CUTLASS benchmarks
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:

diff --git a/.azure/cuda/cuda-benchmark-fp16-bert.yml b/.azure/cuda/cuda-benchmark-fp16-bert.yml
@@ -46,9 +46,6 @@ jobs:
         displayName: Python build
         workingDirectory: "$(Build.SourcesDirectory)"
 
-      # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to
-      # a comma-separated list of device IDs
-      # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
           python gpu_benchmark_tool.py --input gemm_bert_assorted.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True

diff --git a/.azure/cuda/cuda-benchmark-fp16-big.yml b/.azure/cuda/cuda-benchmark-fp16-big.yml
@@ -48,16 +48,9 @@ jobs:
 
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_big_A6000.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_big_A6000.csv,gemm_big.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp16 benchmarks BIG A6000
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:
           ACCOUNT_KEY: $(ACCOUNT_KEY)
 
-      - bash: |
-          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_big.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
-        displayName: Run fp16 benchmarks BIG
-        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
-        env:
-          ACCOUNT_KEY: $(ACCOUNT_KEY)
diff --git a/.azure/cuda/cuda-benchmark-fp16.yml b/.azure/cuda/cuda-benchmark-fp16.yml
@@ -48,16 +48,9 @@ jobs:
 
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_small_A6000.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_small_A6000.csv,gemm_small.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp16 benchmarks A6000
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:
           ACCOUNT_KEY: $(ACCOUNT_KEY)
 
-      - bash: |
-          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_small.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
-        displayName: Run fp16 benchmarks
-        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
-        env:
-          ACCOUNT_KEY: $(ACCOUNT_KEY)
diff --git a/.azure/cuda/cuda-benchmark-fp32-bert.yml b/.azure/cuda/cuda-benchmark-fp32-bert.yml
@@ -46,9 +46,6 @@ jobs:
         displayName: Python build
         workingDirectory: "$(Build.SourcesDirectory)"
 
-      # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to
-      # a comma-separated list of device IDs
-      # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
           python gpu_benchmark_tool.py --input gemm_bert_assorted.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True

diff --git a/.azure/cuda/cuda-benchmark-fp32-big.yml b/.azure/cuda/cuda-benchmark-fp32-big.yml
@@ -48,17 +48,8 @@ jobs:
 
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_big_A6000.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_big_A6000.csv,gemm_big.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp32 benchmarks BIG A6000
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:
           ACCOUNT_KEY: $(ACCOUNT_KEY)
-
-      - bash: |
-          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_big.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
-        displayName: Run fp32 benchmarks BIG
-        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
-        env:
-          ACCOUNT_KEY: $(ACCOUNT_KEY)
-
diff --git a/.azure/cuda/cuda-benchmark-fp32-resnet.yml b/.azure/cuda/cuda-benchmark-fp32-resnet.yml
@@ -46,9 +46,6 @@ jobs:
         displayName: Python build
         workingDirectory: "$(Build.SourcesDirectory)"
 
-      # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to
-      # a comma-separated list of device IDs
-      # e.g. VISIBLE_DEVICES="0, 3"
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
           python gpu_benchmark_tool.py --input gemm_resnet_inception.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True

diff --git a/.azure/cuda/cuda-benchmark-fp32.yml b/.azure/cuda/cuda-benchmark-fp32.yml
@@ -48,16 +48,8 @@ jobs:
 
       - bash: |
           export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_small_A6000.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
+          python gpu_benchmark_tool.py --input gemm_small_A6000.csv,gemm_small.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
         displayName: Run fp32 benchmarks A6000
         workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
         env:
           ACCOUNT_KEY: $(ACCOUNT_KEY)
-
-      - bash: |
-          export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8
-          python gpu_benchmark_tool.py --input gemm_small.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True
-        displayName: Run fp32 benchmarks
-        workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers"
-        env:
-          ACCOUNT_KEY: $(ACCOUNT_KEY)