From 1d3b1a396ea2e0a7ab3fc314c4ceb38de52284df Mon Sep 17 00:00:00 2001 From: Kern Handa Date: Tue, 12 Jul 2022 10:23:04 -0700 Subject: [PATCH] Squashed commit of the following: commit f3a1a2becb6740ae8cf7873b5029c6df140f5c19 Author: Kern Handa Date: Tue Jul 12 16:52:41 2022 +0000 Merged PR 2744: [doc] Fixes link in reference/functions/cast.md, revs version on all docs [doc] Fixes link in reference/functions/cast.md commit 23f4c8fbf2415b02e8b0090a76380d34790205fa Author: Lisa Ong Date: Tue Jul 12 05:55:48 2022 +0000 Merged PR 2743: [DSL] Document implicit casting rules and the explicit `cast` function * Document implicit casting rules implemented by !2693 * Promote `acc.cast` to a documented function to give the user control to override implicit casting behavior commit 3ec63b62705327a65decc4da7ec4cb5412dc7299 Author: Kern Handa Date: Mon Jul 11 23:57:23 2022 +0000 Merged PR 2739: Updates ROCM tensorization pattern to handle casting Updates ROCM tensorization pattern to handle casting commit 60c082dd38ff1b0bc030a7e28dc19f553bad9099 Author: Mason Remy Date: Mon Jul 11 22:58:42 2022 +0000 Merged PR 2643: Some fixes for last major array caching in tensorization Some fixes for last major array caching in tensorization commit 812c3065b7d4d6c9d716acf4fb1df4be66ef101d Author: Kern Handa Date: Mon Jul 11 20:43:12 2022 +0000 Merged PR 2693: Updates DSL codegen to implicitly cast if possible Updates DSL codegen to implicitly cast if possible commit 6ed316e50e8f9e398f9ee6b8bfa8e6aa05fbffb1 Author: Ritwik Das Date: Sat Jul 9 05:52:22 2022 +0000 Merged PR 2735: Pass multiple input files as comma-separated list to benchmark tool https://intelligentdevices.visualstudio.com/ELL/_build/results?buildId=41588&view=logs&j=d78921a4-2f18-50b0-77ad-4c6803f3371b&t=f97c60f6-ada7-5ec9-5ea1-510216c408e9 Above pipeline did not run the 2nd set of input sizes since the 1st process did not exit until pipeline timeout was hit. After the fix, we will always have a single job. 
--- .azure/cuda/cuda-benchmark-baseline.yml | 18 +- .azure/cuda/cuda-benchmark-fp16-bert.yml | 3 - .azure/cuda/cuda-benchmark-fp16-big.yml | 9 +- .azure/cuda/cuda-benchmark-fp16.yml | 9 +- .azure/cuda/cuda-benchmark-fp32-bert.yml | 3 - .azure/cuda/cuda-benchmark-fp32-big.yml | 11 +- .azure/cuda/cuda-benchmark-fp32-resnet.yml | 3 - .azure/cuda/cuda-benchmark-fp32.yml | 10 +- .azure/rocm/rocm-benchmark-baseline.yml | 18 +- .azure/rocm/rocm-benchmark-fp16-bert.yml | 3 - .azure/rocm/rocm-benchmark-fp16-big.yml | 11 +- .azure/rocm/rocm-benchmark-fp16.yml | 11 +- .azure/rocm/rocm-benchmark-fp32-bert.yml | 3 - .azure/rocm/rocm-benchmark-fp32-big.yml | 11 +- .azure/rocm/rocm-benchmark-fp32-resnet.yml | 3 - .azure/rocm/rocm-benchmark-fp32.yml | 13 +- accera/acc-opt/test/thrifty_caching.mlir | 4 +- accera/acc-opt/test/value_mlir_test.cpp | 4 +- accera/ir/include/exec/ExecutionPlanOps.td | 4 +- accera/ir/include/value/ValueOps.td | 28 +- accera/ir/src/exec/ExecutionPlanOps.cpp | 3 + accera/python/accera/__init__.py | 2 +- accera/python/accera/test/dsl_tests.py | 29 +- accera/python/accera/test/mfma_tests.py | 214 ++++++++++++++- accera/python/accera/test/smoke_tests.py | 20 +- accera/python/accera/test/unit_tests.py | 22 +- accera/python/lib/src/ContainerTypes.cpp | 17 +- accera/python/lib/src/SchedulingTypes.cpp | 2 +- .../ExecutionPlanToAffineLoweringPass.cpp | 59 +++- accera/transforms/src/gpu/AcceraToGPUPass.cpp | 46 +++- accera/utilities/include/TypeTraits.h | 1 + accera/value/include/EmitterContext.h | 8 +- accera/value/include/MLIREmitterContext.h | 4 +- accera/value/include/Scalar.h | 5 +- accera/value/include/Value.h | 13 +- accera/value/include/ValueType.h | 55 ++++ accera/value/src/EmitterContext.cpp | 14 +- accera/value/src/MLIREmitterContext.cpp | 70 ++--- accera/value/src/Scalar.cpp | 255 ++++++++++++++---- accera/value/src/ScalarOperations.cpp | 25 +- accera/value/src/Value.cpp | 187 ++++++++++--- accera/value/src/ValueOperations.cpp | 10 - 
docs/.bumpversion.cfg | 2 +- docs/Case Studies/CONTRIBUTING.md | 2 +- docs/Case Studies/README.md | 2 +- docs/Install/Building_on_MacOS.md | 2 +- docs/Install/Building_on_Ubuntu.md | 2 +- docs/Install/Building_on_Windows.md | 2 +- docs/Install/Installing_Accera_on_MacOS.md | 2 +- docs/Install/Installing_Accera_on_Ubuntu.md | 2 +- docs/Install/Installing_Accera_on_Windows.md | 2 +- docs/Install/README.md | 2 +- docs/Manual/00 Introduction.md | 2 +- docs/Manual/01 Arrays.md | 2 +- docs/Manual/02 Simple Affine Loop Nests.md | 33 ++- docs/Manual/03 Schedules.md | 2 +- docs/Manual/04 Fusing.md | 2 +- docs/Manual/05 Targets.md | 2 +- docs/Manual/06 Plans - Caching.md | 2 +- ...07 Plans - Operations and Optimizations.md | 2 +- .../08 Deferred Layout of Constant Arrays.md | 2 +- docs/Manual/09 Parameters.md | 2 +- docs/Manual/10 Packages.md | 2 +- docs/Manual/README.md | 2 +- docs/Reference/accera.md | 5 +- docs/Reference/classes/Array/Array.md | 4 +- docs/Reference/classes/Array/Layout.md | 4 +- docs/Reference/classes/Array/Role.md | 4 +- .../classes/Array/deferred_layout.md | 4 +- docs/Reference/classes/Array/sub_array.md | 4 +- docs/Reference/classes/Nest/Nest.md | 4 +- docs/Reference/classes/Nest/create_plan.md | 4 +- .../Reference/classes/Nest/create_schedule.md | 4 +- docs/Reference/classes/Nest/get_indices.md | 4 +- .../Reference/classes/Nest/iteration_logic.md | 4 +- docs/Reference/classes/Package/Format.md | 4 +- docs/Reference/classes/Package/Mode.md | 4 +- docs/Reference/classes/Package/Package.md | 4 +- docs/Reference/classes/Package/Platform.md | 4 +- docs/Reference/classes/Package/add.md | 4 +- .../classes/Package/add_description.md | 4 +- docs/Reference/classes/Package/build.md | 4 +- docs/Reference/classes/Plan/bind.md | 4 +- docs/Reference/classes/Plan/cache.md | 4 +- docs/Reference/classes/Plan/kernelize.md | 4 +- docs/Reference/classes/Plan/parallelize.md | 4 +- docs/Reference/classes/Plan/tensorize.md | 4 +- docs/Reference/classes/Plan/unroll.md | 4 +- 
docs/Reference/classes/Plan/vectorize.md | 4 +- .../Reference/classes/Schedule/create_plan.md | 4 +- .../classes/Schedule/is_valid_loop_order.md | 4 +- docs/Reference/classes/Schedule/pad.md | 4 +- docs/Reference/classes/Schedule/reorder.md | 4 +- docs/Reference/classes/Schedule/skew.md | 4 +- docs/Reference/classes/Schedule/split.md | 4 +- docs/Reference/classes/Schedule/tile.md | 4 +- docs/Reference/classes/Target/Architecture.md | 4 +- docs/Reference/classes/Target/Category.md | 4 +- docs/Reference/classes/Target/Model.md | 4 +- docs/Reference/classes/Target/Runtime.md | 4 +- docs/Reference/classes/Target/Target.md | 4 +- .../enumerations/MMASchedulingPolicy.md | 4 +- docs/Reference/enumerations/MMAShape.md | 4 +- docs/Reference/enumerations/ScalarType.md | 4 +- docs/Reference/functions/cast.md | 77 ++++++ .../functions/create_parameter_grid.md | 4 +- docs/Reference/functions/create_parameters.md | 4 +- docs/Reference/functions/fuse.md | 4 +- docs/Reference/safety_analysis.md | 4 +- docs/Tutorials/Hello_MatMul.md | 2 +- docs/Tutorials/Hello_MatMul_GPU.md | 2 +- docs/Tutorials/Optimized_MatMul.md | 2 +- docs/Tutorials/Pi3_Cross_Compilation.md | 2 +- docs/Tutorials/README.md | 2 +- tools/benchmarkers/gpu_benchmark_tool.py | 19 +- 115 files changed, 1131 insertions(+), 467 deletions(-) create mode 100644 docs/Reference/functions/cast.md diff --git a/.azure/cuda/cuda-benchmark-baseline.yml b/.azure/cuda/cuda-benchmark-baseline.yml index 41e00371..34ee2931 100644 --- a/.azure/cuda/cuda-benchmark-baseline.yml +++ b/.azure/cuda/cuda-benchmark-baseline.yml @@ -54,13 +54,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas 
$(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_square.csv - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_square.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas 
$(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_resnet_inception.csv + python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv + python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv displayName: Run CUBLAS benchmarks workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: @@ -76,13 +71,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass 
$(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_square.csv - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_square.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass 
$(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_resnet_inception.csv + python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv + python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv displayName: Run CUTLASS benchmarks workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: diff --git a/.azure/cuda/cuda-benchmark-fp16-bert.yml b/.azure/cuda/cuda-benchmark-fp16-bert.yml index f5963e6b..bc129347 100644 --- a/.azure/cuda/cuda-benchmark-fp16-bert.yml +++ b/.azure/cuda/cuda-benchmark-fp16-bert.yml @@ -46,9 +46,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. 
VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 python gpu_benchmark_tool.py --input gemm_bert_assorted.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True diff --git a/.azure/cuda/cuda-benchmark-fp16-big.yml b/.azure/cuda/cuda-benchmark-fp16-big.yml index f526df4e..0e03ec49 100644 --- a/.azure/cuda/cuda-benchmark-fp16-big.yml +++ b/.azure/cuda/cuda-benchmark-fp16-big.yml @@ -48,16 +48,9 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_big_A6000.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_big_A6000.csv,gemm_big.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp16 benchmarks BIG A6000 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_big.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp16 benchmarks BIG - workingDirectory: 
"$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/cuda/cuda-benchmark-fp16.yml b/.azure/cuda/cuda-benchmark-fp16.yml index 0c1148c8..a7755b68 100644 --- a/.azure/cuda/cuda-benchmark-fp16.yml +++ b/.azure/cuda/cuda-benchmark-fp16.yml @@ -48,16 +48,9 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_small_A6000.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_small_A6000.csv,gemm_small.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp16 benchmarks A6000 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_small.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp16 benchmarks - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/cuda/cuda-benchmark-fp32-bert.yml b/.azure/cuda/cuda-benchmark-fp32-bert.yml index cb374382..d9642653 100644 --- a/.azure/cuda/cuda-benchmark-fp32-bert.yml +++ b/.azure/cuda/cuda-benchmark-fp32-bert.yml @@ -46,9 +46,6 @@ jobs: displayName: Python build workingDirectory: 
"$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 python gpu_benchmark_tool.py --input gemm_bert_assorted.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True diff --git a/.azure/cuda/cuda-benchmark-fp32-big.yml b/.azure/cuda/cuda-benchmark-fp32-big.yml index e73e3a44..47a2bce8 100644 --- a/.azure/cuda/cuda-benchmark-fp32-big.yml +++ b/.azure/cuda/cuda-benchmark-fp32-big.yml @@ -48,17 +48,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_big_A6000.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_big_A6000.csv,gemm_big.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp32 benchmarks BIG A6000 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_big.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload 
official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp32 benchmarks BIG - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) - diff --git a/.azure/cuda/cuda-benchmark-fp32-resnet.yml b/.azure/cuda/cuda-benchmark-fp32-resnet.yml index 53cc2260..f10b4362 100644 --- a/.azure/cuda/cuda-benchmark-fp32-resnet.yml +++ b/.azure/cuda/cuda-benchmark-fp32-resnet.yml @@ -46,9 +46,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 python gpu_benchmark_tool.py --input gemm_resnet_inception.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True diff --git a/.azure/cuda/cuda-benchmark-fp32.yml b/.azure/cuda/cuda-benchmark-fp32.yml index 5d479913..b371ebf1 100644 --- a/.azure/cuda/cuda-benchmark-fp32.yml +++ b/.azure/cuda/cuda-benchmark-fp32.yml @@ -48,16 +48,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_small_A6000.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_small_A6000.csv,gemm_small.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload 
official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp32 benchmarks A6000 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_small.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp32 benchmarks - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) \ No newline at end of file diff --git a/.azure/rocm/rocm-benchmark-baseline.yml b/.azure/rocm/rocm-benchmark-baseline.yml index bd2bb907..2e4bdeee 100644 --- a/.azure/rocm/rocm-benchmark-baseline.yml +++ b/.azure/rocm/rocm-benchmark-baseline.yml @@ -52,13 +52,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_rectangle_MI100.csv - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_square.csv - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) 
--output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_rectangle_MI100.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_square.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_resnet_inception.csv + python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output 
$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_rectangle_MI100.csv,gemm_square.csv,gemm_bert_assorted.csv + python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_rectangle_MI100.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv displayName: Run ROCBLAS benchmarks workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: @@ -73,13 +68,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_rectangle_MI100.csv - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_square.csv - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload 
official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_rectangle_MI100.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_square.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_resnet_inception.csv + python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True 
--verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_rectangle_MI100.csv,gemm_square.csv,gemm_bert_assorted.csv + python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_rectangle_MI100.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv displayName: Run CK benchmarks (FP16, RECT) workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: diff --git a/.azure/rocm/rocm-benchmark-fp16-bert.yml b/.azure/rocm/rocm-benchmark-fp16-bert.yml index 003d3420..1cf846ef 100644 --- a/.azure/rocm/rocm-benchmark-fp16-bert.yml +++ b/.azure/rocm/rocm-benchmark-fp16-bert.yml @@ -43,9 +43,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. 
VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib diff --git a/.azure/rocm/rocm-benchmark-fp16-big.yml b/.azure/rocm/rocm-benchmark-fp16-big.yml index 575968af..2bb82b1e 100644 --- a/.azure/rocm/rocm-benchmark-fp16-big.yml +++ b/.azure/rocm/rocm-benchmark-fp16-big.yml @@ -47,18 +47,9 @@ jobs: export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_big_MI100.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True + python gpu_benchmark_tool.py --input gemm_big_MI100.csv,gemm_big.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True displayName: Run fp16 benchmarks BIG MI100 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - export LD_LIBRARY_PATH=${ROCM_PATH}/lib - echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_big.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True - displayName: Run fp16 benchmarks BIG - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/rocm/rocm-benchmark-fp16.yml b/.azure/rocm/rocm-benchmark-fp16.yml index 19c36bac..280ab905 100644 --- 
a/.azure/rocm/rocm-benchmark-fp16.yml +++ b/.azure/rocm/rocm-benchmark-fp16.yml @@ -47,18 +47,9 @@ jobs: export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_small_MI100.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True + python gpu_benchmark_tool.py --input gemm_small_MI100.csv,gemm_small.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True displayName: Run fp16 benchmarks MI100 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - export LD_LIBRARY_PATH=${ROCM_PATH}/lib - echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_small.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True - displayName: Run fp16 benchmarks - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/rocm/rocm-benchmark-fp32-bert.yml b/.azure/rocm/rocm-benchmark-fp32-bert.yml index 588cedc3..7db56566 100644 --- a/.azure/rocm/rocm-benchmark-fp32-bert.yml +++ b/.azure/rocm/rocm-benchmark-fp32-bert.yml @@ -43,9 +43,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a 
comma-separated list of device IDs - # e.g. VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib diff --git a/.azure/rocm/rocm-benchmark-fp32-big.yml b/.azure/rocm/rocm-benchmark-fp32-big.yml index b067ac19..05102fd0 100644 --- a/.azure/rocm/rocm-benchmark-fp32-big.yml +++ b/.azure/rocm/rocm-benchmark-fp32-big.yml @@ -47,19 +47,10 @@ jobs: export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_big_MI100.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True + python gpu_benchmark_tool.py --input gemm_big_MI100.csv,gemm_big.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True displayName: Run fp32 benchmarks BIG MI100 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - export LD_LIBRARY_PATH=${ROCM_PATH}/lib - echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_big.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True - displayName: Run fp32 benchmarks BIG - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/rocm/rocm-benchmark-fp32-resnet.yml 
b/.azure/rocm/rocm-benchmark-fp32-resnet.yml index 615e7e7d..44fdcb1e 100644 --- a/.azure/rocm/rocm-benchmark-fp32-resnet.yml +++ b/.azure/rocm/rocm-benchmark-fp32-resnet.yml @@ -43,9 +43,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib diff --git a/.azure/rocm/rocm-benchmark-fp32.yml b/.azure/rocm/rocm-benchmark-fp32.yml index dfd1b80f..e98f93a4 100644 --- a/.azure/rocm/rocm-benchmark-fp32.yml +++ b/.azure/rocm/rocm-benchmark-fp32.yml @@ -47,19 +47,8 @@ jobs: export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_small_MI100.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_small_MI100.csv,gemm_small.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp32 benchmarks MI100 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - export LD_LIBRARY_PATH=${ROCM_PATH}/lib - echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_small.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output 
$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp32 benchmarks - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) - diff --git a/accera/acc-opt/test/thrifty_caching.mlir b/accera/acc-opt/test/thrifty_caching.mlir index c9c9aa56..335c7e25 100644 --- a/accera/acc-opt/test/thrifty_caching.mlir +++ b/accera/acc-opt/test/thrifty_caching.mlir @@ -17,10 +17,10 @@ module @test_thrifty_caching_simple_input_cache attributes {llvm.data_layout = " %4 = accln.sym_index {name = "k_i"} #accln<"index{k_i,8}"> %5 = accln.sym_index {name = "j_o"} #accln<"index{j_o,5}"> "accv.lambda"() ( { - %6 = "accxp.make_cache"() {memorySpace = 0 : i64, multiCacheAccessIndices = [], offsetAccessIndices = [], offsetArrayToCacheAccessMap = affine_map<(d0) -> (d0)>} : () -> memref + %6 = "accxp.make_cache"() {activeBlockToCacheMap = affine_map<(d0, d1) -> (d0, d1)>, memorySpace = 0 : i64, multiCacheAccessIndices = [], offsetAccessIndices = [], offsetArrayToCacheAccessMap = affine_map<(d0) -> (d0)>} : () -> memref %7 = "accxp.begin_create_cache"(%arg0, %6, %arg0, %1, %2, %0, %4, %1, %2) {activeBlockCache, cacheAccessMaps = {manualCacheDimOrder = [0, 1]}, cacheHierarchyLevel = 0 : i64, cacheIndex = #accln<"index{i_i,4}">, cacheRegionBaseIndices = [[#accln<"index{i,0}">], [#accln<"index{k,2}">]], cacheRegionRelevantIndexRanges = [#accln<"indexrange{i_i,4}={0:4:1}">, #accln<"indexrange{k_i,8}={0:32:1}">], dimReorderCache, id = 0 : i64, operand_segment_sizes = dense<[1, 1, 1, 4, 2]> : vector<5xi32>, thrifty, triggerIndex = #accln<"index{i_i,4}">} : (memref<32x32xf32, affine_map<(d0, d1) -> (d0 * 32 + d1)>>, memref, memref<32x32xf32, affine_map<(d0, d1) -> (d0 * 32 + d1)>>, index, index, index, index, index, index) -> index "accxp.end_cache_region"(%7) : (index) -> () - %8 = "accxp.make_cache"() {memorySpace 
= 0 : i64, multiCacheAccessIndices = [], offsetAccessIndices = [], offsetArrayToCacheAccessMap = affine_map<(d0) -> (d0)>} : () -> memref + %8 = "accxp.make_cache"() {activeBlockToCacheMap = affine_map<(d0, d1) -> (d0, d1)>, memorySpace = 0 : i64, multiCacheAccessIndices = [], offsetAccessIndices = [], offsetArrayToCacheAccessMap = affine_map<(d0) -> (d0)>} : () -> memref %9 = "accxp.begin_create_cache"(%arg1, %8, %arg1, %5, %2, %3, %4, %5) {activeBlockCache, cacheAccessMaps = {manualCacheDimOrder = [0, 1]}, cacheHierarchyLevel = 0 : i64, cacheIndex = #accln<"index{k_o,7}">, cacheRegionBaseIndices = [[#accln<"index{k,2}">], [#accln<"index{j,1}">], [#accln<"index{k,2}">]], cacheRegionRelevantIndexRanges = [#accln<"indexrange{k_o,7}={0:32:32}">, #accln<"indexrange{j_i,6}={0:16:1}">, #accln<"indexrange{k_i,8}={0:32:1}">], dimReorderCache, id = 1 : i64, operand_segment_sizes = dense<[1, 1, 1, 4, 1]> : vector<5xi32>, thrifty, triggerIndex = #accln<"index{k_o,7}">} : (memref<32x32xf32, affine_map<(d0, d1) -> (d0 * 32 + d1)>>, memref, memref<32x32xf32, affine_map<(d0, d1) -> (d0 * 32 + d1)>>, index, index, index, index, index) -> index "accxp.end_cache_region"(%9) : (index) -> () affine.for %arg3 = 0 to 32 step 4 { diff --git a/accera/acc-opt/test/value_mlir_test.cpp b/accera/acc-opt/test/value_mlir_test.cpp index 985470c0..4dc6d45f 100644 --- a/accera/acc-opt/test/value_mlir_test.cpp +++ b/accera/acc-opt/test/value_mlir_test.cpp @@ -1735,7 +1735,7 @@ TEST_CASE("jit_int8_simple_matrix_multiply_test1") }); auto computeKernel = Kernel("compute", [&, i = i, j = j, k = k]() { - auto a = UnsignedCast(A(i, k), cType); + auto a = Cast(A(i, k), cType); auto b = Cast(B(k, j), cType); auto prod = a * b; @@ -1860,7 +1860,7 @@ TEST_CASE("jit_int8_simple_matrix_multiply_test3") auto [kOuter, kInner] = schedule.Split(k, kernelK); auto computeKernel = Kernel("compute", [&, i = i, j = j, k = k]() { - auto a = UnsignedCast(A(i, k), cType); + auto a = Cast(A(i, k), cType); auto b = 
Cast(B(k, j), cType); auto prod = a * b; C(i, j) += prod; diff --git a/accera/ir/include/exec/ExecutionPlanOps.td b/accera/ir/include/exec/ExecutionPlanOps.td index 6d164f04..2899313a 100644 --- a/accera/ir/include/exec/ExecutionPlanOps.td +++ b/accera/ir/include/exec/ExecutionPlanOps.td @@ -101,6 +101,7 @@ def accxp_MakeCacheOp : accxp_Op<"make_cache"> { corresponding position in the cache. }]; let arguments = (ins MemorySpaceAttr:$memorySpace, + AffineMapAttr:$activeBlockToCacheMap, AffineMapAttr:$offsetArrayToCacheAccessMap, ArrayAttr:$offsetAccessIndices, ArrayAttr:$multiCacheAccessIndices); @@ -112,6 +113,7 @@ def accxp_MakeCacheOp : accxp_Op<"make_cache"> { OpBuilder<(ins "mlir::MemRefType":$cache, "MemorySpace":$memorySpace, + "AffineMap":$activeBlockToCacheMap, "AffineMap":$offsetArrayToCacheAccessMap, "const std::vector&":$offsetAccessIndices, "const std::vector&":$multiCacheAccessIndices @@ -184,7 +186,7 @@ def accxp_ActiveBlockCacheCopyOp : accxp_Op<"active_block_cache_copy", [AttrSize AffineMapAttr:$activeBlockToCacheMap, UnitAttr:$toCache, UnitAttr:$thrifty, - UnitAttr:$skipBarriers, // TODO : remove this once barrier analysis hoists barriers out of loops + UnitAttr:$skipBarriers, // TODO : remove this once barrier analysis hoists barriers out of loops OptionalAttr:$vectorizationInfo); } diff --git a/accera/ir/include/value/ValueOps.td b/accera/ir/include/value/ValueOps.td index 663cb305..71a6fb47 100644 --- a/accera/ir/include/value/ValueOps.td +++ b/accera/ir/include/value/ValueOps.td @@ -1217,12 +1217,23 @@ def accv_MMALoadSyncOp : accv_Op<"wmma_load_sync", UI32Attr:$mmaShapeType, I8Attr:$operandType, Variadic:$indices, - AffineMapAttr:$map + AffineMapAttr:$map, // TODO : maybe rename this positionMap or something? 
+ OptionalAttr:$tileAccessMap ); let results = (outs MemRefRankOf<[I8, I32, F16, F32], [1]>:$result); let builders = [ + OpBuilder<(ins "Type":$resultType, "Value":$memref, "MMAShape":$mmaShapeType, "MMAOperandType":$operandType, "AffineMap":$map, "ValueRange":$mapOperands, "AffineMap":$tileAccessMap), [{ + assert(map.getNumInputs() == mapOperands.size() && "inconsistent index info"); + $_state.addOperands(memref); + $_state.addOperands(mapOperands); + $_state.addAttribute("mmaShapeType", $_builder.getUI32IntegerAttr((uint32_t)mmaShapeType)); + $_state.addAttribute("operandType", $_builder.getI8IntegerAttr((int8_t)operandType)); + $_state.addAttribute(getMapAttrName(), AffineMapAttr::get(map)); + $_state.addAttribute(getTileAccessMapAttrName(), AffineMapAttr::get(tileAccessMap)); + $_state.addTypes(resultType); + }]>, OpBuilder<(ins "Type":$resultType, "Value":$memref, "MMAShape":$mmaShapeType, "MMAOperandType":$operandType, "AffineMap":$map, "ValueRange":$mapOperands), [{ assert(map.getNumInputs() == mapOperands.size() && "inconsistent index info"); $_state.addOperands(memref); @@ -1255,6 +1266,7 @@ def accv_MMALoadSyncOp : accv_Op<"wmma_load_sync", } static StringRef getMapAttrName() { return "map"; } + static StringRef getTileAccessMapAttrName() { return "tileAccessMap"; } }]; let assemblyFormat = [{ @@ -1283,9 +1295,20 @@ def accv_MMAStoreSyncOp : accv_Op<"wmma_store_sync", [ Arg, "",[MemWrite]>:$memref, UI32Attr:$mmaShapeType, Variadic:$indices, - AffineMapAttr:$map); + AffineMapAttr:$map, // TODO : maybe rename this positionMap or something? 
+ OptionalAttr:$tileAccessMap); let builders = [ + OpBuilder<(ins "Value":$src, "Value":$memref, "MMAShape":$mmaShapeType, "AffineMap":$map, "ValueRange":$mapOperands, "AffineMap":$tileAccessMap), [{ + assert(map.getNumInputs() == mapOperands.size() && "inconsistent index info"); + $_state.addOperands(src); + $_state.addOperands(memref); + $_state.addOperands(mapOperands); + $_state.addAttribute("mmaShapeType", $_builder.getUI32IntegerAttr((uint32_t)mmaShapeType)); + $_state.addAttribute(getMapAttrName(), AffineMapAttr::get(map)); + $_state.addAttribute(getTileAccessMapAttrName(), AffineMapAttr::get(tileAccessMap)); + + }]>, OpBuilder<(ins "Value":$src, "Value":$memref, "MMAShape":$mmaShapeType, "AffineMap":$map, "ValueRange":$mapOperands), [{ assert(map.getNumInputs() == mapOperands.size() && "inconsistent index info"); $_state.addOperands(src); @@ -1320,6 +1343,7 @@ def accv_MMAStoreSyncOp : accv_Op<"wmma_store_sync", [ } static StringRef getMapAttrName() { return "map"; } + static StringRef getTileAccessMapAttrName() { return "tileAccessMap"; } }]; let assemblyFormat = [{ diff --git a/accera/ir/src/exec/ExecutionPlanOps.cpp b/accera/ir/src/exec/ExecutionPlanOps.cpp index 89fdb2a7..a4640bd4 100644 --- a/accera/ir/src/exec/ExecutionPlanOps.cpp +++ b/accera/ir/src/exec/ExecutionPlanOps.cpp @@ -974,6 +974,7 @@ namespace executionPlan cacheType, memorylocation, mlir::AffineMap::getMultiDimIdentityMap(cacheType.getRank(), builder.getContext()), + mlir::AffineMap::getMultiDimIdentityMap(cacheType.getRank(), builder.getContext()), std::vector{}, std::vector{}); } @@ -982,6 +983,7 @@ namespace executionPlan OperationState& result, mlir::MemRefType cacheType, accera::ir::value::MemorySpace memorylocation, + AffineMap activeBlockToCacheMap, AffineMap offsetArrayToCacheAccessMap, const std::vector& offsetAccessIndices, const std::vector& multiCacheAccessIndices) @@ -993,6 +995,7 @@ namespace executionPlan result, cacheType, memorylocation, + activeBlockToCacheMap, 
offsetArrayToCacheAccessMap, offsetAccessIndexAttrs, multiCacheAccessIndexAttrs); diff --git a/accera/python/accera/__init__.py b/accera/python/accera/__init__.py index f4488f4b..38482454 100644 --- a/accera/python/accera/__init__.py +++ b/accera/python/accera/__init__.py @@ -18,7 +18,7 @@ from ._lang_python import CompilerOptions, ScalarType, _GetTargetDeviceFromName from ._lang_python import ( abs, max, min, ceil, floor, sqrt, exp, log, log10, log2, sin, cos, tan, sinh, cosh, tanh, logical_and, logical_or, - logical_not, _cast, _unsigned_cast + logical_not, cast ) # Global initialization diff --git a/accera/python/accera/test/dsl_tests.py b/accera/python/accera/test/dsl_tests.py index c368de37..e4dd5638 100644 --- a/accera/python/accera/test/dsl_tests.py +++ b/accera/python/accera/test/dsl_tests.py @@ -438,7 +438,8 @@ def _(): def test_array_value_type_cast(self) -> None: A = Array( - shape=(256, 32), role=Array.Role.INPUT, layout=Array.Layout.FIRST_MAJOR + shape=(256, 32), role=Array.Role.INPUT, layout=Array.Layout.FIRST_MAJOR, + element_type=ScalarType.float32, ) B = Array( shape=(256, 32), @@ -452,7 +453,7 @@ def test_array_value_type_cast(self) -> None: @nest.iteration_logic def _(): - A[i, j] = 5 # implicit cast from int8 to float + A[i, j] = 5 # implicit cast from int8 to float32 B[i, j] = 10 # implicit cast from int8 to int32 A_test = np.random.random((256, 32)).astype(np.float32) @@ -691,12 +692,36 @@ def test_arithmetic_operations(self) -> None: nest, A, B, C = self._create_nest((16, 10, 11), type=t) i, j, k = nest.get_indices() + int_val = 2 + float_val = 1.5 + @nest.iteration_logic def _(): C[i, j] = A[i, k] + B[k, j] # test assignment C[i, j] += A[i, k] - B[k, j] C[i, j] += A[i, k] * B[k, j] C[i, j] += A[i, k] / B[k, j] + + if t != ScalarType.float16: + C[i, j] += int_val + A[i,k] + C[i, j] += int_val - A[i, k] + C[i, j] += int_val * A[i,k] + C[i, j] += int_val / A[i, k] + C[i, j] += A[i, k] + int_val + C[i, j] += A[i, k] - int_val + C[i, j] += A[i, 
k] * int_val + C[i, j] += A[i, k] / int_val + + if t in FLOAT_TYPES: + C[i, j] += float_val + A[i,k] + C[i, j] += float_val - A[i, k] + C[i, j] += float_val * A[i,k] + C[i, j] += float_val / A[i, k] + C[i, j] += A[i, k] + float_val + C[i, j] += A[i, k] - float_val + C[i, j] += A[i, k] * float_val + C[i, j] += A[i, k] / float_val + C[i, j] += -A[i, k] C[i, j] += A[i, k] // B[k, j] C[i, j] += A[i, k] % B[k, j] diff --git a/accera/python/accera/test/mfma_tests.py b/accera/python/accera/test/mfma_tests.py index 0d215db2..5ef24c37 100644 --- a/accera/python/accera/test/mfma_tests.py +++ b/accera/python/accera/test/mfma_tests.py @@ -99,7 +99,7 @@ def _check_cu_has_no_mfma(self, test_name, verifier): ) checker.check_not('__builtin_amdgcn_mfma_') checker.run() - + def _get_np_datatype(self, p): from bfloat16 import bfloat16 if p.element_type == ScalarType.bfloat16: @@ -109,7 +109,8 @@ def _get_np_datatype(self, p): def _get_random_data(self, p): datatype = self._get_np_datatype(p) - if p.element_type == ScalarType.int8 or p.element_type == ScalarType.int32: + if p.element_type in [ScalarType.int8, ScalarType.int16, ScalarType.int32, ScalarType.int64, ScalarType.uint8, + ScalarType.uint16, ScalarType.uint32, ScalarType.uint64]: return np.random.randint(-2, 2, p.shape, datatype) return np.random.random(p.shape).astype(datatype) @@ -183,7 +184,7 @@ def _rocm_matmul(self, test_name, M, N, K, block_tile, outer_tile_k, thread_tile if thread_tile is None: thread_tile = block_tile - + if inner_tile_k is None: inner_tile_k = outer_tile_k @@ -285,7 +286,7 @@ def _rocm_batch_matmul(self, test_name, batch_count, M, N, K, block_tile, outer_ if thread_tile is None: thread_tile = block_tile - + if inner_tile_k is None: inner_tile_k = outer_tile_k @@ -308,7 +309,7 @@ def _(): i: block_tile[0], j: block_tile[1], k: outer_tile_k - }) + }) else: ii, jj, kk = schedule.tile({ i: block_tile[0], @@ -538,7 +539,7 @@ def _(): i: outer_tile_x, j: outer_tile_y }) - + mma_shape = 
_MMAShape.M16xN16xK4_B1 num_total_passes = 4 target = Target(Target.Model.AMD_MI100) @@ -834,7 +835,7 @@ def _(): if ROCM_AVAILABLE: self._verify_matmul(function, A, B, C, v) - + # This should produce MFMA instructions def test_rocm_tensorize_multi_block_multi_warp_output_reordered_indices(self) -> None: from accera import Array, Nest, Package, ScalarType, Target @@ -1406,7 +1407,7 @@ def _(): shutil.rmtree(output_dir, ignore_errors=True) with verifiers.VerifyPackage(self, test_name, output_dir) as v: - package.build(test_name, + package.build(test_name, format=Package.Format.MLIR_DYNAMIC, mode=Package.Mode.RELEASE, output_dir=output_dir @@ -1561,7 +1562,7 @@ def _(): j: outer_tile_n, k: outer_tile_k }) - + target = Target(Target.Model.NVIDIA_RTX_A6000) if tensorize: tensor_splits = target.tensor_core_info.compute_tensor_splits(mma_shape, num_total_passes) @@ -1583,7 +1584,7 @@ def _(): kk: default_thread_splits[2] }) schedule.reorder(i, j, k, ii, jj, kk, iii, jjj, kkk) - + plan = schedule.create_plan(target=target) plan.bind( mapping={ @@ -1643,7 +1644,7 @@ def test_cuda_non_square_last_major_output(self) -> None: def test_cuda_non_square_last_major_inputs_output(self) -> None: self._cuda_cache_tensorize(M=1280, N=768, K=1024, outer_tile_m=16, outer_tile_n=16, outer_tile_k=128, test_name="test_cuda_non_square_last_major_inputs_output", tensorize=False, - cache=False, vectorize=False, element_type=ScalarType.float32, + cache=False, vectorize=False, element_type=ScalarType.float32, array_layouts=[Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR]) def test_cuda_tensorize_non_square_last_major_inputs(self) -> None: @@ -2302,6 +2303,197 @@ def test_batchgemm_rocm_vectorized_cache_double_buffering_tensorize_square_bspli test_name="test_batchgemm_rocm_vectorized_cache_double_buffering_tensorize_square_bsplit", double_buffer=True, double_buffer_location=_MemorySpace.PRIVATE, vectorize=True, use_static_offsets=False) + def 
_test_rocm_cache_memory_order_helper(self, a_layout, a_cache_layout, double_buffer, vectorize, tensorize) -> None: + + from accera import Array, Nest, Package, ScalarType, Target + + M = 512 + N = 512 + K = 512 + + # Pick the A and B tile sizes to be smaller than the number of threads per block + outer_tile_x = 64 + outer_tile_y = 64 + outer_tile_k = 64 + + A = Array(role=Array.Role.INPUT, element_type=ScalarType.float32, shape=(M, K), layout=a_layout) + B = Array(role=Array.Role.INPUT, element_type=ScalarType.float32, shape=(K, N), layout=Array.Layout.FIRST_MAJOR) + C = Array(role=Array.Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N), layout=Array.Layout.FIRST_MAJOR) + + nest = Nest(shape=(M, N, K)) + i, j, k = nest.get_indices() + + @nest.iteration_logic + def _(): + C[i, j] += A[i, k] * B[k, j] + + schedule = nest.create_schedule() + + ii, jj, kk = schedule.tile({ + i: outer_tile_x, + j: outer_tile_y, + k: outer_tile_k + }) + + mma_shape = _MMAShape.M16xN16xK4_B1 + num_total_passes = 4 + target = Target(Target.Model.AMD_MI100) + tensor_splits = target.tensor_core_info.compute_tensor_splits(mma_shape, num_total_passes=num_total_passes) + + iii, jjj, kkk = schedule.tile({ + ii: tensor_splits[0], + jj: tensor_splits[1], + kk: tensor_splits[2] + }) + if tensorize: + outer_nest_order = (i, j, k, ii, jj, kk) + plan, tensorization_indices = schedule._create_tensorizable_plan(target, block_indices=(i, j), warp_indices=(ii, jj), tensor_indices=(iii, jjj, kkk), outer_nest_order=outer_nest_order, mma_shape=mma_shape) + plan.tensorize(indices=tensorization_indices, mma_shape=mma_shape, num_total_passes=num_total_passes) + else: + schedule.reorder(i, j, k, ii, jj, kk, iii, jjj, kkk) + plan = schedule.create_plan(target) + plan.bind( + mapping={ + i: target.GridUnit.BLOCK_Y, + j: target.GridUnit.BLOCK_X, + iii: target.GridUnit.THREAD_Y, + jjj: target.GridUnit.THREAD_X + } + ) + + plan.cache(A, + index=ii, + double_buffer=double_buffer, + vectorize=vectorize, 
+ location=target.MemorySpace.SHARED, + layout=a_cache_layout + ) + + layout_str_map = { + Array.Layout.FIRST_MAJOR : "F", + Array.Layout.LAST_MAJOR : "L" + } + name_parts = [ + "test_rocm_cache_tensorized", + layout_str_map[a_layout], + layout_str_map[a_cache_layout], + f"_db_{double_buffer}", + f"_vec_{vectorize}", + f"_tens_{tensorize}" + ] + test_name = "_".join(name_parts) + package = Package() + function = package.add(plan, args=(A, B, C), base_name=test_name) + + self._verify_matrix_multiplication_function( + function, + package, + test_name, + check_correctness=ROCM_AVAILABLE, + file_list=[f"{test_name}.cu", f"{test_name}.hat"], + package_format=Package.Format.DEFAULT | Package.Format.MLIR + ) + + # FIRST-FIRST + def test_rocm_memory_order_cache_tensorized_F_F_T_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, True, True, True) + + def test_rocm_memory_order_cache_tensorized_F_F_T_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, True, True, False) + + def test_rocm_memory_order_cache_tensorized_F_F_T_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, True, False, True) + + def test_rocm_memory_order_cache_tensorized_F_F_F_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, False, True, True) + + def test_rocm_memory_order_cache_tensorized_F_F_T_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, True, False, False) + + def test_rocm_memory_order_cache_tensorized_F_F_F_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, False, False, True) + + def test_rocm_memory_order_cache_tensorized_F_F_F_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, 
Array.Layout.FIRST_MAJOR, False, True, False) + + def test_rocm_memory_order_cache_tensorized_F_F_F_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, False, False, False) + + # FIRST-LAST + def test_rocm_memory_order_cache_tensorized_F_L_T_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, True, True, True) + + def test_rocm_memory_order_cache_tensorized_F_L_T_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, True, True, False) + + def test_rocm_memory_order_cache_tensorized_F_L_T_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, True, False, True) + + def test_rocm_memory_order_cache_tensorized_F_L_F_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, False, True, True) + + def test_rocm_memory_order_cache_tensorized_F_L_T_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, True, False, False) + + def test_rocm_memory_order_cache_tensorized_F_L_F_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, False, False, True) + + def test_rocm_memory_order_cache_tensorized_F_L_F_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, False, True, False) + + def test_rocm_memory_order_cache_tensorized_F_L_F_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, False, False, False) + + # LAST-FIRST + def test_rocm_memory_order_cache_tensorized_L_F_T_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, True, True, True) + + def 
test_rocm_memory_order_cache_tensorized_L_F_T_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, True, True, False) + + def test_rocm_memory_order_cache_tensorized_L_F_T_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, True, False, True) + + def test_rocm_memory_order_cache_tensorized_L_F_F_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, False, True, True) + + def test_rocm_memory_order_cache_tensorized_L_F_T_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, True, False, False) + + def test_rocm_memory_order_cache_tensorized_L_F_F_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, False, False, True) + + def test_rocm_memory_order_cache_tensorized_L_F_F_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, False, True, False) + + def test_rocm_memory_order_cache_tensorized_L_F_F_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, False, False, False) + + # LAST-LAST + def test_rocm_memory_order_cache_tensorized_L_L_T_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, True, True, True) + + def test_rocm_memory_order_cache_tensorized_L_L_T_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, True, True, False) + + def test_rocm_memory_order_cache_tensorized_L_L_T_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, True, False, True) + + def test_rocm_memory_order_cache_tensorized_L_L_F_T_T(self) -> None: + 
self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, False, True, True) + + def test_rocm_memory_order_cache_tensorized_L_L_T_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, True, False, False) + + def test_rocm_memory_order_cache_tensorized_L_L_F_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, False, False, True) + + def test_rocm_memory_order_cache_tensorized_L_L_F_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, False, True, False) + + def test_rocm_memory_order_cache_tensorized_L_L_F_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, False, False, False) + if __name__ == '__main__': unittest.main(verbosity=10) diff --git a/accera/python/accera/test/smoke_tests.py b/accera/python/accera/test/smoke_tests.py index bca45001..1ea6716a 100644 --- a/accera/python/accera/test/smoke_tests.py +++ b/accera/python/accera/test/smoke_tests.py @@ -735,7 +735,7 @@ def test_two_vulkan_gpu_matmul(self) -> None: @expectedFailure(FailedReason.NOT_IN_CORE, "function that contains multiple nests") def test_int8_matmul(self) -> None: - from accera import _cast, _unsigned_cast + from accera import cast # Define our matrix sizes M = 128 @@ -756,7 +756,7 @@ def get_compute_col_sums_schedule(): @compute_col_sums_nest.iteration_logic def _(): - b = _unsigned_cast(B[k, j], ScalarType.int32) + b = cast(B[k, j], ScalarType.int32) col_sums[j] += b return compute_col_sums_nest.create_schedule() @@ -777,7 +777,7 @@ def get_compute_row_sums_schedule(): @compute_row_sums_nest.iteration_logic def _(): - a = _cast(A[i, k], ScalarType.int32) + a = cast(A[i, k], ScalarType.int32) row_sums[i] += a return compute_row_sums_nest.create_schedule() @@ -818,8 +818,8 @@ def get_matmul_schedule(): 
@matmul_nest.iteration_logic def _(): - a = _cast(A[i, k], ScalarType.int32) - b = _unsigned_cast(B[k, j], ScalarType.int32) + a = cast(A[i, k], ScalarType.int32) + b = cast(B[k, j], ScalarType.int32) C[i, j] += a * b return matmul_nest.create_schedule() @@ -3955,9 +3955,8 @@ def file_check_fn(verifier): checker.check('affine.for %[[lpt_iv:[a-z0-9_]+]] = 0 to 2 {') checker.check('affine.for %[[Thread_X_iv:[a-z0-9_]+]] = 0 to 1 {') checker.check('affine.for %[[Thread_Y_iv:[a-z0-9_]+]] = 0 to 1 {') - checker.check('%[[Loaded_A_Val:[0-9_]+]] = affine.load %[[Array_A]][%[[lpt_iv]] * 8 + symbol(%[[Block_X]]) * 16 - (symbol(%[[Block_X]]) floordiv 160) * 2560 + symbol(%[[Thread_Y]]) floordiv 2 - ((%[[lpt_iv]] * 8 + symbol(%[[Thread_Y]]) floordiv 2) floordiv 16) * 16, %[[k_iv]] + %[[kk_iv]] + symbol(%[[Thread_Y]]) * 16 + symbol(%[[Thread_X]]) - (symbol(%[[Thread_Y]]) floordiv 2) * 32] : memref<2560x2048xf32, affine_map<(d0, d1) -> (d0 * 2048 + d1)>>') - # Note: (16*thread_y) % 32 == (16*thread_y) - 32((16*thread_y) floordiv 32) == (16*thread_y) - 32(thread_y floordiv 2) - checker.check('affine.store %[[Loaded_A_Val]], %[[Cache_A]][(%[[lpt_iv]] * 8 + symbol(%[[Thread_Y]]) floordiv 2) mod 16, symbol(%[[Thread_Y]]) * 16 + symbol(%[[Thread_X]]) - (symbol(%[[Thread_Y]]) floordiv 2) * 32] : memref<16x32xf32, 3>') + checker.check('%[[Loaded_A_Val:[0-9_]+]] = affine.load %[[Array_A]][symbol(%[[Block_X]]) * 16 + symbol(%[[Thread_X]]) - (symbol(%[[Block_X]]) floordiv 160) * 2560, %[[lpt_iv]] * 16 + %[[k_iv]] + %[[kk_iv]] + symbol(%[[Thread_Y]])] : memref<2560x2048xf32, affine_map<(d0, d1) -> (d0 * 2048 + d1)>>') + checker.check('affine.store %[[Loaded_A_Val]], %[[Cache_A]][symbol(%[[Thread_X]]), %[[lpt_iv]] * 16 + symbol(%[[Thread_Y]])] : memref<16x32xf32, 3>') # check the B matrix load / store checker.check('"accv.lambda"() ( {') @@ -4518,7 +4517,7 @@ def _(): def test_fill_fp16(self): from accera import Array, Nest, Package, ScalarType - from accera import _cast + from accera 
import cast # Define our vector sizes N = 2**16 @@ -4530,7 +4529,7 @@ def test_fill_fp16(self): @nest.iteration_logic def _(): - Out[i] = _cast(2, ScalarType.float16) + Out[i] = cast(2, ScalarType.float16) schedule = nest.create_schedule() plan = schedule.create_plan() @@ -4850,6 +4849,5 @@ def _(): package_format=Package.Format.DEFAULT | Package.Format.MLIR ) - if __name__ == '__main__': unittest.main(verbosity=10) diff --git a/accera/python/accera/test/unit_tests.py b/accera/python/accera/test/unit_tests.py index c7ae4fe5..e9330c4e 100644 --- a/accera/python/accera/test/unit_tests.py +++ b/accera/python/accera/test/unit_tests.py @@ -101,7 +101,7 @@ def test_scalar_conditionals(self) -> None: self.assertIsInstance(s != 10, Scalar) def test_cast(self) -> None: - from accera import _cast, Array, Nest, ScalarType + from accera import cast, Array, Nest, ScalarType for t in [ScalarType.int8, ScalarType.int16, ScalarType.int32, ScalarType.int64, ScalarType.float16, ScalarType.float32, ScalarType.float64]: M, S, N = 16, 11, 10 @@ -114,10 +114,10 @@ def test_cast(self) -> None: @nest.iteration_logic def _(): - C[i, j] += A[i, k] + _cast(B[k, j], t) - C[i, j] += A[i, k] - _cast(B[k, j], t) - C[i, j] += A[i, k] * _cast(B[k, j], t) - C[i, j] += A[i, k] / _cast(B[k, j], t) + C[i, j] += A[i, k] + cast(B[k, j], t) + C[i, j] += A[i, k] - cast(B[k, j], t) + C[i, j] += A[i, k] * cast(B[k, j], t) + C[i, j] += A[i, k] / cast(B[k, j], t) package = Package() package.add(nest, args=(A, B, C), base_name=f"test_cast_{t.name}") @@ -126,8 +126,8 @@ def _(): package.build(package_name, output_dir=TEST_PACKAGE_DIR) def test_unsigned_cast(self) -> None: - from accera import _unsigned_cast, Array, Nest, ScalarType - for t in [ScalarType.uint8, ScalarType.uint16]: # TODO: , ScalarType.uint32, ScalarType.uint64]: + from accera import cast, Array, Nest, ScalarType + for t in [ScalarType.uint8, ScalarType.uint16, ScalarType.uint32, ScalarType.uint64]: M, S, N = 16, 11, 10 A = 
Array(role=Array.Role.INPUT, element_type=t, shape=(M, S)) B = Array(role=Array.Role.INPUT, element_type=ScalarType.int32, shape=(S, N)) @@ -138,10 +138,10 @@ def test_unsigned_cast(self) -> None: @nest.iteration_logic def _(): - C[i, j] += A[i, k] + _unsigned_cast(B[k, j], t) - C[i, j] += A[i, k] - _unsigned_cast(B[k, j], t) - C[i, j] += A[i, k] * _unsigned_cast(B[k, j], t) - C[i, j] += A[i, k] / _unsigned_cast(B[k, j], t) + C[i, j] += A[i, k] + cast(B[k, j], t) + C[i, j] += A[i, k] - cast(B[k, j], t) + C[i, j] += A[i, k] * cast(B[k, j], t) + C[i, j] += A[i, k] / cast(B[k, j], t) package = Package() package.add(nest, args=(A, B, C), base_name=f"test_unsigned_cast_{t.name}") diff --git a/accera/python/lib/src/ContainerTypes.cpp b/accera/python/lib/src/ContainerTypes.cpp index 297c6ec2..b4157f53 100644 --- a/accera/python/lib/src/ContainerTypes.cpp +++ b/accera/python/lib/src/ContainerTypes.cpp @@ -151,12 +151,9 @@ General constructor. module.def("logical_or", [](value::Scalar s1, value::Scalar s2) { return value::Cast(s1 || s2, s1.GetType()); }); - module.def("_cast", [](value::Scalar s, value::ValueType type) { + module.def("cast", [](value::Scalar s, value::ValueType type) { return value::Cast(s, type); }); - module.def("_unsigned_cast", [](value::Scalar s, value::ValueType type) { - return value::UnsignedCast(s, type); - }); } void DefineArrayClass(py::module& module) @@ -358,6 +355,18 @@ specific to the EmitterContext, specified by the Emittable type. 
.def(float() * py::self) .def(py::self * int()) .def(int() * py::self) + .def(py::self + float()) + .def(float() + py::self) + .def(py::self + int()) + .def(int() + py::self) + .def(py::self / float()) + .def(float() / py::self) + .def(py::self / int()) + .def(int() / py::self) + .def(py::self - float()) + .def(float() - py::self) + .def(py::self - int()) + .def(int() - py::self) .def("__rshift__", [](value::Scalar& s, value::Scalar& shift) { return value::UnsignedShiftRight(s, value::Cast(shift, s.GetType())); }) diff --git a/accera/python/lib/src/SchedulingTypes.cpp b/accera/python/lib/src/SchedulingTypes.cpp index 49e06428..6feb40bb 100644 --- a/accera/python/lib/src/SchedulingTypes.cpp +++ b/accera/python/lib/src/SchedulingTypes.cpp @@ -29,7 +29,7 @@ void DefineScheduleClass(py::module& module) "split", [](value::Schedule& sched, value::ScalarIndex& i, int factor) { auto ret = sched.Split(i, factor); using std::swap; - swap(i, ret.first); + i = std::move(ret.first); return ret.second; }, "i"_a, diff --git a/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp b/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp index fd0a1d9a..03c4ea37 100644 --- a/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp +++ b/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -26,6 +25,7 @@ #include #include #include +#include #include #include @@ -41,11 +41,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -1724,6 +1726,7 @@ MakeCacheOp UpdateActiveBlockCacheAccess(PatternRewriter& rewriter, auto replacementOp = rewriter.create(shapedMakeCacheOp.getLoc(), shapedMakeCacheOp.getType(), shapedMakeCacheOp.memorySpace(), + activeBlockToCacheMap, arrayToCacheMap, offsetAccessIndices, multiCacheAccessIndices); @@ -1950,7 +1953,8 @@ v::MMALoadSyncOp CreateMMALoad(mlir::OpBuilder& 
builder, if (auto srcCacheOp = mlir::dyn_cast_or_null(src.getDefiningOp())) { mlir::AffineValueMap loadAccessInfo = srcCacheOp.insertCachePosition(builder.getInsertionBlock(), baseArrayPosition); - return builder.create(loc, resultType, src, mmaShapeType, operandType, loadAccessInfo.getAffineMap(), loadAccessInfo.getOperands()); + mlir::AffineMap tileAccessMap = srcCacheOp.activeBlockToCacheMap(); + return builder.create(loc, resultType, src, mmaShapeType, operandType, loadAccessInfo.getAffineMap(), loadAccessInfo.getOperands(), tileAccessMap); } else { @@ -1968,7 +1972,8 @@ v::MMAStoreSyncOp CreateMMAStore(mlir::OpBuilder& builder, if (auto dstCacheOp = mlir::dyn_cast_or_null(dst.getDefiningOp())) { mlir::AffineValueMap storeAccessInfo = dstCacheOp.insertCachePosition(builder.getInsertionBlock(), baseArrayPosition); - return builder.create(loc, value, dst, mmaShapeType, storeAccessInfo.getAffineMap(), storeAccessInfo.getOperands()); + mlir::AffineMap tileAccessMap = dstCacheOp.activeBlockToCacheMap(); + return builder.create(loc, value, dst, mmaShapeType, storeAccessInfo.getAffineMap(), storeAccessInfo.getOperands(), tileAccessMap); } else { @@ -2725,23 +2730,25 @@ LogicalResult ActiveBlockCacheCopyOpRewrite::matchAndRewrite(ActiveBlockCacheCop mlir::AffineMap cacheFillNestMap = mlir::AffineMap::get(5, 0, cacheFillNestToFlatExpr); - llvm::SmallVector multiCacheStrides; + llvm::SmallVector outerArrayStrides; int64_t activeBlockOffset; // TODO : do we need to leverage this in any way? 
we're currently just arranging the threads according to fast/slow dimensions of the logical memref - auto strideResult = mlir::getStridesAndOffset(memRefType, multiCacheStrides, activeBlockOffset); + auto strideResult = mlir::getStridesAndOffset(memRefType, outerArrayStrides, activeBlockOffset); assert(succeeded(strideResult)); - auto numMultiCacheDims = multiCacheStrides.size() - activeBlockRank; - std::vector activeBlockStrides(multiCacheStrides.begin() + numMultiCacheDims, multiCacheStrides.end()); + auto numOuterArrayMultiCacheDims = outerArrayStrides.size() - activeBlockRank; + std::vector outerArrayActiveBlockStrides(outerArrayStrides.begin() + numOuterArrayMultiCacheDims, outerArrayStrides.end()); // We want to traverse the dimensions of the active block in increasing stride order, so keep track of the logical dimensions and sort them std::vector> activeBlockLogicalDimAndStride; size_t dimIdxCounter = 0; - std::transform(activeBlockStrides.begin(), activeBlockStrides.end(), std::back_inserter(activeBlockLogicalDimAndStride), [&](int64_t stride) { + std::transform(outerArrayActiveBlockStrides.begin(), outerArrayActiveBlockStrides.end(), std::back_inserter(activeBlockLogicalDimAndStride), [&](int64_t stride) { return std::make_pair(dimIdxCounter++, stride); }); +#if 0 // TODO : re-enable for coalesced reads and fix transpose caching with double-buffering case std::sort(activeBlockLogicalDimAndStride.begin(), activeBlockLogicalDimAndStride.end(), [](const std::pair& left, const std::pair& right) { return left.second < right.second; }); +#endif auto cumulativeStride = 1; std::vector flatToActiveBlockExprs(activeBlockRank); @@ -4708,6 +4715,7 @@ MakeCacheOp CreateDoubleBufferTempArray(mlir::OpBuilder& builder, return builder.create(parentLambda.getLoc(), tempArrayType, memorySpaceEnum, + info.activeBlockToCacheMap, tempArrayAccessMap, tempArrayOffsetIndices, tempArrayMultiCacheAccessIndices); @@ -6437,6 +6445,29 @@ LogicalResult 
TensorizeAffineForOpConversion::matchAndRewrite(AffineForOp affine // 4. load C (void)innerLoopBodyIter++; + mlir::Operation* castOp = nullptr; + // TODO: Figure out if there's a better way to list the possible OPs that can be used to change type but still be valid IR (maybe an Accera specific cast OP that takes an attr instead?) + if (innerLoopBodyIter != innerLoopBodyEnd && isa< + // Ops that have CastOpInterface + mlir::FPExtOp, + mlir::FPToSIOp, + mlir::FPToUIOp, + mlir::FPTruncOp, + mlir::IndexCastOp, + mlir::SIToFPOp, + mlir::UIToFPOp, + mlir::UnrealizedConversionCastOp, + + // Ops that don't have CastOpInterface + mlir::SignExtendIOp, + mlir::TruncateIOp, + mlir::ZeroExtendIOp>( + innerLoopBodyIter)) + { + castOp = &(*innerLoopBodyIter++); + opsToErase.push(castOp); + } + if (innerLoopBodyIter == innerLoopBodyEnd || !isa(*innerLoopBodyIter)) { return reportMatchFailure(affineForOp, "Failed to match the load from C Op"); @@ -6449,11 +6480,11 @@ LogicalResult TensorizeAffineForOpConversion::matchAndRewrite(AffineForOp affine auto loadCMap = loadCOp.getAffineMap(); if (cRank < 2) { - return reportMatchFailure(loadCOp.getOperation(), "C array has rank < 2"); + return reportMatchFailure(loadCOp, "C array has rank < 2"); } if (cRank != loadCMap.getNumResults()) { - return reportMatchFailure(loadCOp.getOperation(), "Failed to match the load from C Op"); + return reportMatchFailure(loadCOp, "Failed to match the load from C Op"); } // scan loadCOperands and note which ones are loop vars that refer to GPU block/thread IDs (or are affine expressions of them) @@ -6463,12 +6494,12 @@ LogicalResult TensorizeAffineForOpConversion::matchAndRewrite(AffineForOp affine if (gpuDimsPerDimC.size() != 2) { - return reportMatchFailure(loadCOp.getOperation(), "Failed to match the load from C Op"); + return reportMatchFailure(loadCOp, "Failed to match the load from C Op"); } if (ContainsDim(gpuDimsPerDimC, GPUIndexDimension::Z)) { - return reportMatchFailure(loadCOp.getOperation(), 
"Failed to match: C op uses GPU Z dimension"); + return reportMatchFailure(loadCOp, "Failed to match: C op uses GPU Z dimension"); } opsToErase.push(loadCOp); @@ -6484,8 +6515,10 @@ LogicalResult TensorizeAffineForOpConversion::matchAndRewrite(AffineForOp affine { return reportMatchFailure(accumC, "Failed to match the accumulation op"); } + // Check that the operands for the addition op are in fact (A*B) and the load from C - if (!((accumC.lhs() == mulAB && accumC.rhs() == loadCOp) || (accumC.rhs() == mulAB && accumC.lhs() == loadCOp))) + auto mulABresolved = (castOp ? castOp : mulAB)->getResult(0); + if (!((accumC.lhs() == mulABresolved && accumC.rhs() == loadCOp) || (accumC.rhs() == mulABresolved && accumC.lhs() == loadCOp))) { return reportMatchFailure(accumC, "Failed to match the accumulation operands"); } diff --git a/accera/transforms/src/gpu/AcceraToGPUPass.cpp b/accera/transforms/src/gpu/AcceraToGPUPass.cpp index 141abaef..891ae5c8 100644 --- a/accera/transforms/src/gpu/AcceraToGPUPass.cpp +++ b/accera/transforms/src/gpu/AcceraToGPUPass.cpp @@ -498,9 +498,7 @@ struct ValueMMALoadSyncOpToRocDLConversion final : public OpConversionPattern intraTileOffsetExprs = { upperLeftCornerRowDim + fullRowOffsetExpr, upperLeftCornerColDim + fullColOffsetExpr }; - auto intraTileOffsetMap = mlir::AffineMap::get(3, 1, intraTileOffsetExprs, rewriter.getContext()); - auto mfmaExternalDimsMap = mlir::AffineMap::getMultiDimIdentityMap(externalIndices, rewriter.getContext()); - auto fullMatrixAccessMap = utilir::ConcatenateAndShiftAffineDimsAndMaps(rewriter, mfmaExternalDimsMap, intraTileOffsetMap); auto loop = rewriter.replaceOpWithNewOp(op, 0, vecSize, 1, vec); auto loopBuilder = utilir::MakeBodyBuilder(loop); auto inductionVar = loop.getInductionVar(); auto destVec = loop.getRegionIterArgs()[0]; + // 1 dim for the induction var, 1 symbol for the warp thread ID + auto warpTileOffsetMap = mlir::AffineMap::get(1, 1, { fullRowOffsetExpr, fullColOffsetExpr }, 
rewriter.getContext()); + + // Shift the dims in warpTileOffsetMap by the mfma external dim count + auto mfmaExternalDimsMap = mlir::AffineMap::getMultiDimIdentityMap(externalIndices, loopBuilder.getContext()); + warpTileOffsetMap = utilir::ConcatenateAndShiftAffineDimsAndMaps(loopBuilder, mfmaExternalDimsMap, warpTileOffsetMap); + + // The buffer being accessed may hold the tile data in a physical order different from the logical order, + // so use the tileAccessMap map attr if it exists to map how offsets within the logical tile translate + // to offsets within the physical buffer layout + auto tileAccessMapOpt = op.tileAccessMap(); + if (tileAccessMapOpt.hasValue()) + { + auto tileAccessMap = tileAccessMapOpt.getValue(); + warpTileOffsetMap = tileAccessMap.compose(warpTileOffsetMap); + } + + mlir::Value zeroIndex = rewriter.create(loc, 0); + std::vector warpTileOffsetOperands(upperLeftCornerPos.size(), zeroIndex); + warpTileOffsetOperands[warpTileOffsetOperands.size() - 2] = inductionVar; + warpTileOffsetOperands[warpTileOffsetOperands.size() - 1] = warpTidVal; + auto warpTileOffsetPhysicalPos = utilir::MultiDimAffineApply(loopBuilder, loc, warpTileOffsetMap, warpTileOffsetOperands); + + // Now that we have the offsets within the physical buffer for this thread in the warp, add that to the physical upper left corner position + // To get the full position in the memref + + std::vector matrixAccessExprs; + for (size_t i = 0; i < upperLeftCornerPos.size(); ++i) + { + matrixAccessExprs.emplace_back(loopBuilder.getAffineDimExpr(i) + loopBuilder.getAffineDimExpr(i + upperLeftCornerPos.size())); + } + + auto fullMatrixAccessMap = mlir::AffineMap::get(2 * upperLeftCornerPos.size(), 0, matrixAccessExprs, loopBuilder.getContext()); + std::vector accessOperands(upperLeftCornerPos); - accessOperands.push_back(inductionVar); - accessOperands.push_back(warpTidVal); + accessOperands.insert(accessOperands.end(), warpTileOffsetPhysicalPos.begin(), 
warpTileOffsetPhysicalPos.end()); + auto accessPos = utilir::MultiDimAffineApply(loopBuilder, loc, fullMatrixAccessMap, accessOperands); auto load = loopBuilder.create(loc, memref, accessPos); diff --git a/accera/utilities/include/TypeTraits.h b/accera/utilities/include/TypeTraits.h index 17c9c888..d963a28c 100644 --- a/accera/utilities/include/TypeTraits.h +++ b/accera/utilities/include/TypeTraits.h @@ -6,6 +6,7 @@ #pragma once +#include #include #include diff --git a/accera/value/include/EmitterContext.h b/accera/value/include/EmitterContext.h index d8838851..3e91e869 100644 --- a/accera/value/include/EmitterContext.h +++ b/accera/value/include/EmitterContext.h @@ -222,6 +222,9 @@ namespace value /// An instance of Value that contains a reference to the allocated memory Value StoreConstantData(ConstantData data, MemoryLayout layout, const std::string& name); + /// Returns true or false depending on whether the data stored is known to be a constant + bool IsConstantData(Value) const; + /// Makes a reference to a constant data source. 
This data source may originate from another context /// The constant data source that will be referenced from this context /// An instance of Value that contains a reference to the source Value @@ -349,8 +352,6 @@ namespace value Scalar Cast(Scalar value, ValueType type); - Scalar UnsignedCast(Scalar value, ValueType type); - Scalar Bitcast(Scalar value, ValueType type); IfContext If(Scalar test, std::function fn); @@ -444,6 +445,7 @@ namespace value virtual bool IsFunctionDefinedImpl(FunctionDeclaration decl) const = 0; virtual Value StoreConstantDataImpl(ConstantData data, MemoryLayout layout, const std::string& name) = 0; + virtual bool IsConstantDataImpl(Value) const = 0; virtual Value ResolveConstantDataReferenceImpl(Value constantDataSource) = 0; virtual void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) = 0; @@ -481,8 +483,6 @@ namespace value virtual Scalar CastImpl(Scalar value, ValueType type) = 0; - virtual Scalar UnsignedCastImpl(Scalar value, ValueType type) = 0; - virtual Scalar BitcastImpl(Scalar value, ValueType type) = 0; virtual IfContext IfImpl(Scalar test, std::function fn) = 0; diff --git a/accera/value/include/MLIREmitterContext.h b/accera/value/include/MLIREmitterContext.h index 273dd2fa..f3f68b16 100644 --- a/accera/value/include/MLIREmitterContext.h +++ b/accera/value/include/MLIREmitterContext.h @@ -135,6 +135,7 @@ namespace value bool IsFunctionDefinedImpl(FunctionDeclaration decl) const override; Value StoreConstantDataImpl(ConstantData data, MemoryLayout layout, const std::string& name) override; + bool IsConstantDataImpl(Value) const override; Value ResolveConstantDataReferenceImpl(Value constantDataSource) override; void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) override; @@ -171,11 +172,8 @@ namespace value void MMAStoreSyncImpl(const MatrixFragment& source, Matrix& target, const int64_t rowOffset, const int64_t colOffset) override; Value MMAComputeSyncImpl(const 
MatrixFragment& A, const MatrixFragment& B, const MatrixFragment& C, uint32_t cbsz, uint32_t abid, uint32_t blgp) override; - Scalar CastImpl(Scalar value, ValueType type, bool doSignedCast); Scalar CastImpl(Scalar value, ValueType type) override; - Scalar UnsignedCastImpl(Scalar value, ValueType type) override; - Scalar BitcastImpl(Scalar value, ValueType type) override; IfContext IfImpl(Scalar test, std::function fn) override; diff --git a/accera/value/include/Scalar.h b/accera/value/include/Scalar.h index 565488fa..ac9b8fd6 100644 --- a/accera/value/include/Scalar.h +++ b/accera/value/include/Scalar.h @@ -76,6 +76,10 @@ namespace value /// The type ValueType GetType() const; + // Returns [s1, s2] with either s1 casted to s2's type or s2 casted to s1's type or unchanged if + // there is no implicit type conversion that can be done. + static std::pair MakeTypeCompatible(Scalar s1, Scalar s2); + private: friend Scalar operator+(Scalar, Scalar); friend Scalar operator*(Scalar, Scalar); @@ -110,7 +114,6 @@ namespace value { return MakeScalar(GetValueType(), name); } - } // namespace value } // namespace accera diff --git a/accera/value/include/Value.h b/accera/value/include/Value.h index 447889f9..07fbe5c0 100644 --- a/accera/value/include/Value.h +++ b/accera/value/include/Value.h @@ -27,6 +27,12 @@ namespace accera { namespace value { + class TypeMismatchException : public utilities::GenericException + { + public: + TypeMismatchException(ValueType expected, ValueType actual) : + GenericException(ToString(actual) + " type is incompatible with " + ToString(expected)) {} + }; class Value; class Scalar; @@ -305,7 +311,7 @@ namespace value /// Returns true if the instance is undefined bool IsUndefined() const; - /// Returns true if the instance holds data + /// Returns true if the instance does not hold data bool IsEmpty() const; /// Returns true if the instance holds constant data @@ -375,6 +381,9 @@ namespace value /// Clear the data, if any, on this instance void 
ClearData(); + /// Resets this instance + void Clear(); + /// Returns the number of pointer indirections on the data referred to by this instance /// The number of pointer indirections int PointerLevel() const; @@ -495,6 +504,8 @@ namespace value return Cast(value, GetValueType()); } + bool IsImplicitlyCastable(ViewAdapter v1, ViewAdapter v2); + } // namespace value } // namespace accera diff --git a/accera/value/include/ValueType.h b/accera/value/include/ValueType.h index 3a4a4320..ccbbbff9 100644 --- a/accera/value/include/ValueType.h +++ b/accera/value/include/ValueType.h @@ -171,6 +171,61 @@ namespace value } } + constexpr inline bool IsSignedType(ValueType t) + { + switch (t) + { + case ValueType::BFloat16: + [[fallthrough]]; + case ValueType::Float16: + [[fallthrough]]; + case ValueType::Float: + [[fallthrough]]; + case ValueType::Double: + [[fallthrough]]; + case ValueType::Int8: + [[fallthrough]]; + case ValueType::Int16: + [[fallthrough]]; + case ValueType::Int32: + [[fallthrough]]; + case ValueType::Int64: + return true; + default: + return false; + } + } + + constexpr inline bool IsUnsignedType(ValueType t) + { + switch (t) + { + case ValueType::Boolean: + [[fallthrough]]; + case ValueType::Byte: + [[fallthrough]]; + case ValueType::Uint16: + [[fallthrough]]; + case ValueType::Uint32: + [[fallthrough]]; + case ValueType::Uint64: + return true; + default: + return false; + } + } + + constexpr inline bool IsSignlessType(ValueType t) + { + switch (t) + { + case ValueType::Index: + return true; + default: + return false; + } + } + /// Get a string representation of the enum value std::string ToString(ValueType t); ValueType FromString(std::string name); diff --git a/accera/value/src/EmitterContext.cpp b/accera/value/src/EmitterContext.cpp index 874035cb..3729166e 100644 --- a/accera/value/src/EmitterContext.cpp +++ b/accera/value/src/EmitterContext.cpp @@ -139,6 +139,15 @@ namespace value Value EmitterContext::StoreConstantData(ConstantData data, 
MemoryLayout layout, const std::string& name) { return StoreConstantDataImpl(data, layout, name); } + bool EmitterContext::IsConstantData(Value v) const + { + if (!v.IsDefined() || v.IsEmpty()) return false; + + if (!std::holds_alternative(v.GetUnderlyingData())) return true; + + return IsConstantDataImpl(v); + } + Value EmitterContext::ResolveConstantDataReference(Value source) { return ResolveConstantDataReferenceImpl(source); } void EmitterContext::For(MemoryLayout layout, std::function)> fn, const std::string& name) @@ -251,11 +260,6 @@ namespace value return CastImpl(value, type); } - Scalar EmitterContext::UnsignedCast(Scalar value, ValueType type) - { - return UnsignedCastImpl(value, type); - } - Scalar EmitterContext::Bitcast(Scalar value, ValueType type) { return BitcastImpl(value, type); diff --git a/accera/value/src/MLIREmitterContext.cpp b/accera/value/src/MLIREmitterContext.cpp index ab5dd3ac..ba9468cf 100644 --- a/accera/value/src/MLIREmitterContext.cpp +++ b/accera/value/src/MLIREmitterContext.cpp @@ -6,6 +6,7 @@ #include "MLIREmitterContext.h" #include "CompilerOptions.h" +#include "ValueType.h" #include #include @@ -17,6 +18,7 @@ #include #include +#include #include #include @@ -302,7 +304,7 @@ mlir::FunctionType ToMLIRType(mlir::OpBuilder& builder, const FunctionDeclaratio [[nodiscard]] mlir::Value ToMLIRValue(mlir::OpBuilder& builder, const ViewAdapter& view) { auto value = view.GetValue(); - if (value.IsEmpty() || value.IsUndefined() || value.IsConstant()) + if (value.IsEmpty() || value.IsUndefined()) { return {}; } @@ -1449,6 +1451,19 @@ Value MLIRContext::StoreConstantDataImpl(ConstantData data, MemoryLayout layout, return Value(emittable, layout); } +bool MLIRContext::IsConstantDataImpl(Value v) const +{ + auto data = Unwrap(v); + + // TODO: Extend this check to handle constant data arrays. 
Right now, this only works for scalar values + if (llvm::isa_and_nonnull(data.getDefiningOp())) + { + return true; + } + + return false; +} + Value MLIRContext::ResolveConstantDataReferenceImpl(Value constantDataSource) { auto sourceRefGlobalOp = mlir::Value::getFromOpaquePointer(constantDataSource.Get().GetDataAs()->data).getDefiningOp(); @@ -2013,7 +2028,7 @@ Value MLIRContext::MMAComputeSyncImpl(const MatrixFragment& A, const MatrixFragm return Value(emittable, C.GetValue().GetLayout()); } -Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) +Scalar MLIRContext::CastImpl(Scalar value, ValueType type) { auto& builder = _impl->builder; mlir::Value mlirValue = ResolveMLIRScalar(builder, ToMLIRValue(builder, value)); @@ -2027,6 +2042,8 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) return Wrap(mlirValue); } + auto doSignedCast = IsSignedType(type); + return mlir::TypeSwitch(fromType) .Case([&](mlir::IntegerType fromIntType) { auto signlessMlirValue = accera::ir::util::ToSignlessMLIRValue(builder, mlirValue); @@ -2034,22 +2051,31 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) return mlir::TypeSwitch(toType) .Case([&](mlir::IntegerType toIntType) { auto toIntTypeSignless = accera::ir::util::ToSignlessMLIRType(builder, toIntType); + + mlir::Value casted; if (fromIntType.getWidth() == toIntType.getWidth()) { - return Wrap(signlessMlirValue); + // do nothing } else if (fromIntType.getWidth() > toIntType.getWidth()) { - return Wrap(builder.create(loc, signlessMlirValue, toIntTypeSignless)); - } - else if (doSignedCast || !fromIntType.isUnsigned()) - { - return Wrap(builder.create(loc, signlessMlirValue, toIntTypeSignless)); + + signlessMlirValue = builder.create(loc, signlessMlirValue, toIntTypeSignless); } else { - return Wrap(builder.create(loc, signlessMlirValue, toIntTypeSignless)); + if (doSignedCast) + { + signlessMlirValue = builder.create(loc, signlessMlirValue, 
toIntTypeSignless); + } + else + { + signlessMlirValue = builder.create(loc, signlessMlirValue, toIntTypeSignless); + } } + + casted = builder.create(signlessMlirValue.getLoc(), toType, signlessMlirValue)->getResult(0); + return Wrap(casted); }) .Case([&](mlir::IndexType) { return Wrap(builder.create(loc, signlessMlirValue, toType)); @@ -2057,10 +2083,8 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) .Case([&](mlir::FloatType) { return fromIntType.isUnsigned() ? Wrap(builder.create(loc, signlessMlirValue, toType)) : Wrap(builder.create(loc, signlessMlirValue, toType)); }) - .Default([&](mlir::Type) { + .Default([&](mlir::Type) -> Scalar { throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, __FILE__ " : " + std::to_string(__LINE__)); - llvm_unreachable("unexpected"); - return Scalar(); }); }) .Case([&](mlir::IndexType) { @@ -2073,10 +2097,8 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) auto int64Value = builder.create(loc, mlirValue, builder.getI64Type()); // index->int64 return Wrap(builder.create(loc, int64Value, toType)); // int64->fp }) - .Default([&](mlir::Type) { + .Default([&](mlir::Type) -> Scalar { throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, __FILE__ " : " + std::to_string(__LINE__)); - llvm_unreachable("unexpected"); - return Scalar(); }); }) .Case([&](mlir::FloatType fromFloatType) { @@ -2088,29 +2110,15 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) .Case([&](mlir::FloatType toFloatType) { return fromFloatType.getWidth() > toFloatType.getWidth() ? 
Wrap(builder.create(loc, mlirValue, toType)) : Wrap(builder.create(loc, mlirValue, toType)); }) - .Default([&](mlir::Type) { + .Default([&](mlir::Type) -> Scalar { throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, __FILE__ " : " + std::to_string(__LINE__)); - llvm_unreachable("unexpected"); - return Scalar(); }); }) - .Default([&](mlir::Type) { + .Default([&](mlir::Type) -> Scalar { throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, __FILE__ " : " + std::to_string(__LINE__)); - llvm_unreachable("unexpected"); - return Scalar(); }); } -Scalar MLIRContext::CastImpl(Scalar value, ValueType type) -{ - return CastImpl(value, type, /*doSignedCast=*/true); -} - -Scalar MLIRContext::UnsignedCastImpl(Scalar value, ValueType type) -{ - return CastImpl(value, type, /*doSignedCast=*/false); -} - Scalar MLIRContext::BitcastImpl(Scalar value, ValueType type) { auto& builder = _impl->builder; diff --git a/accera/value/src/Scalar.cpp b/accera/value/src/Scalar.cpp index 5d8803a7..737eb646 100644 --- a/accera/value/src/Scalar.cpp +++ b/accera/value/src/Scalar.cpp @@ -5,7 +5,10 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// #include "Scalar.h" +#include "Emittable.h" #include "EmitterContext.h" +#include "ScalarOperations.h" +#include "Value.h" #include "ValueType.h" #include @@ -17,42 +20,6 @@ namespace accera { namespace value { - namespace - { - template - constexpr bool ItemIsOneOf(T&& t, C&& c) - { - return llvm::any_of(c, [=](auto arg) { return t == arg; }); - } - - bool IsImplcitTypeCastable(ValueType source, ValueType target) - { -#define MAP_TARGET_TO_POSSIBLE_SOURCES(TARGET, ...) 
\ - case TARGET: \ - return ItemIsOneOf(source, std::initializer_list{ __VA_ARGS__ }) - - switch (target) - { - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int8, ValueType::Boolean); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Byte, ValueType::Boolean, ValueType::Int8); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Uint16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int32, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Uint32); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint32, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int64, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Uint64); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint64, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Float16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::BFloat16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Float, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, ValueType::Uint64, ValueType::Float16, ValueType::BFloat16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Double, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, 
ValueType::Uint64, ValueType::Float16, ValueType::BFloat16, ValueType::Float); - - default: - return false; - } - -#undef MAP_TARGET_TO_POSSIBLE_SOURCES - } - } // namespace using namespace utilities; Scalar::Scalar() = default; @@ -81,10 +48,16 @@ namespace value { if (this != &other) { - if (GetType() != other.GetType() && IsImplcitTypeCastable(other.GetType(), GetType())) + auto e1 = _value.TryGet(); + auto e2 = other._value.TryGet(); + if (e1 && e2 && e1->GetDataAs() == e2->GetDataAs()) + { + return *this; + } + if (GetType() != other.GetType() && IsImplicitlyCastable(other, *this)) { Scalar castedScalar = Cast(other, GetType()); - _value = castedScalar._value; + GetContext().CopyData(castedScalar._value, _value); } else { @@ -98,21 +71,30 @@ namespace value { if (this != &other) { - if (GetType() != other.GetType() && IsImplcitTypeCastable(other.GetType(), GetType())) + auto e1 = _value.TryGet(); + auto e2 = other._value.TryGet(); + if (e1 && e2 && e1->GetDataAs() == e2->GetDataAs()) + { + return *this; + } + if (GetType() != other.GetType() && IsImplicitlyCastable(other, *this)) { Scalar castedScalar = Cast(other, GetType()); - _value = std::move(castedScalar._value); + GetContext().MoveData(castedScalar._value, _value); } else { _value = std::move(other._value); } - other._value = Value(); + other._value.Clear(); } return *this; } - Value Scalar::GetValue() const { return _value; } + Value Scalar::GetValue() const + { + return _value; + } Scalar Scalar::Copy() const { @@ -121,42 +103,185 @@ namespace value return s; } - ValueType Scalar::GetType() const { return _value.GetBaseType(); } + ValueType Scalar::GetType() const + { + return _value.GetBaseType(); + } - void Scalar::SetName(const std::string& name) { _value.SetName(name); } + void Scalar::SetName(const std::string& name) + { + _value.SetName(name); + } - std::string Scalar::GetName() const { return _value.GetName(); } + std::string Scalar::GetName() const + { + return _value.GetName(); + } Scalar& 
Scalar::operator+=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::add, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::add, _value, rhs._value); return *this; } Scalar& Scalar::operator-=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::subtract, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::subtract, _value, rhs._value); + // auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(*this, s); + + // _value = GetContext().BinaryOperation(ValueBinaryOperation::subtract, lhs._value, rhs._value); return *this; } Scalar& Scalar::operator*=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::multiply, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::multiply, _value, rhs._value); + // auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(*this, s); + + // _value = GetContext().BinaryOperation(ValueBinaryOperation::multiply, lhs._value, rhs._value); return *this; } Scalar& Scalar::operator/=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::divide, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw 
TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::divide, _value, rhs._value); + // auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(*this, s); + + // _value = GetContext().BinaryOperation(ValueBinaryOperation::divide, lhs._value, rhs._value); return *this; } Scalar& Scalar::operator%=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::modulus, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::modulus, _value, rhs._value); + // auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(*this, s); + + // _value = GetContext().BinaryOperation(ValueBinaryOperation::modulus, lhs._value, rhs._value); return *this; } + std::pair Scalar::MakeTypeCompatible(Scalar s1, Scalar s2) + { + auto s1Type = s1.GetType(); + auto s2Type = s2.GetType(); + + if (s1Type == s2Type) + { + return { s1, s2 }; + } + + assert((!s1.IsConstant() || !s2.IsConstant()) && "Unexpected scenario"); + + Scalar newS1, newS2; + if (s1.IsConstant()) + { + newS1 = Cast(s1, s2Type); + newS2 = s2; + } + else if (s2.IsConstant()) + { + newS1 = s1; + newS2 = Cast(s2, s1Type); + } + else if (IsImplicitlyCastable(s1, s2)) + { + newS1 = Cast(s1, s2Type); + newS2 = s2; + } + else if (IsImplicitlyCastable(s2, s1)) + { + newS1 = s1; + newS2 = Cast(s2, s1Type); + } + + return { newS1, newS2 }; + } + // Free function operator overloads Scalar operator+(Scalar s1, Scalar s2) { @@ -234,32 +359,44 @@ namespace value Scalar operator==(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::equality, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return 
GetContext().LogicalOperation(ValueLogicalOperation::equality, lhs.GetValue(), rhs.GetValue()); } Scalar operator!=(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::inequality, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::inequality, lhs.GetValue(), rhs.GetValue()); } Scalar operator<=(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::lessthanorequal, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::lessthanorequal, lhs.GetValue(), rhs.GetValue()); } Scalar operator<(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::lessthan, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::lessthan, lhs.GetValue(), rhs.GetValue()); } Scalar operator>=(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::greaterthanorequal, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::greaterthanorequal, lhs.GetValue(), rhs.GetValue()); } Scalar operator>(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::greaterthan, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::greaterthan, lhs.GetValue(), rhs.GetValue()); } Scalar operator&&(Scalar s1, Scalar s2) @@ -268,8 +405,9 @@ namespace value { throw LogicException(LogicExceptionErrors::illegalState); } + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); - return GetContext().BinaryOperation(ValueBinaryOperation::logicalAnd, s1.GetValue(), s2.GetValue()); 
+ return GetContext().BinaryOperation(ValueBinaryOperation::logicalAnd, lhs.GetValue(), rhs.GetValue()); } Scalar operator||(Scalar s1, Scalar s2) @@ -278,8 +416,9 @@ namespace value { throw LogicException(LogicExceptionErrors::illegalState); } + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); - return GetContext().BinaryOperation(ValueBinaryOperation::logicalOr, s1.GetValue(), s2.GetValue()); + return GetContext().BinaryOperation(ValueBinaryOperation::logicalOr, lhs.GetValue(), rhs.GetValue()); } Scalar MakeScalar(ValueType type, const std::string&) diff --git a/accera/value/src/ScalarOperations.cpp b/accera/value/src/ScalarOperations.cpp index 5babd398..6506974a 100644 --- a/accera/value/src/ScalarOperations.cpp +++ b/accera/value/src/ScalarOperations.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace accera { @@ -96,31 +97,36 @@ namespace value Scalar Add(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy += s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs += rhs; } Scalar Subtract(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy -= s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs -= rhs; } Scalar Multiply(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy *= s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs *= rhs; } Scalar Divide(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy /= s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs /= rhs; } Scalar Modulo(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy %= s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs %= rhs; } Scalar FusedMultiplyAdd(Scalar a, Scalar b, Scalar c) @@ -194,16 +200,24 @@ namespace value Scalar Max(Scalar s1, Scalar s2) { + std::tie(s1, s2) = Scalar::MakeTypeCompatible(s1, s2); + return Select(s1 > s2, s1, s2); } Scalar Min(Scalar s1, Scalar s2) { + std::tie(s1, s2) = 
Scalar::MakeTypeCompatible(s1, s2); + return Select(s1 < s2, s1, s2); } Scalar Clamp(Scalar s, Scalar min, Scalar max) { + std::tie(min, max) = Scalar::MakeTypeCompatible(min, max); + std::tie(s, min) = Scalar::MakeTypeCompatible(s, min); + std::tie(s, max) = Scalar::MakeTypeCompatible(s, max); + return Min(max, Max(s, min)); } @@ -220,6 +234,7 @@ namespace value Scalar Select(Scalar cmp, Scalar a, Scalar b) { + std::tie(a, b) = Scalar::MakeTypeCompatible(a, b); return ScalarOpBuilder(cmp, a, b); } diff --git a/accera/value/src/Value.cpp b/accera/value/src/Value.cpp index 498a537f..ba094216 100644 --- a/accera/value/src/Value.cpp +++ b/accera/value/src/Value.cpp @@ -6,6 +6,7 @@ #include "Value.h" #include "EmitterContext.h" +#include "ValueType.h" #include #include @@ -177,7 +178,6 @@ namespace value { _type = other._type; } - GetContext().MoveData(other, *this); } } @@ -213,7 +213,7 @@ namespace value void Value::SetData(Value value, bool force) { - if (!force && value.IsConstrained() && value.GetLayout() != GetLayout()) + if (!force && IsConstrained() && value.IsConstrained() && value.GetLayout() != GetLayout()) { throw InputException(InputExceptionErrors::invalidArgument); } @@ -226,10 +226,7 @@ namespace value } _data = emittable; - if (!force) - { - _type = type; - } + _type = type; }, [this, force](auto&& arg) { if (!force && GetValueType>() != _type.first) @@ -242,9 +239,15 @@ namespace value value._data); } - bool Value::IsDefined() const { return _type.first != ValueType::Undefined; } + bool Value::IsDefined() const + { + return _type.first != ValueType::Undefined; + } - bool Value::IsUndefined() const { return !IsDefined(); } + bool Value::IsUndefined() const + { + return !IsDefined(); + } bool Value::IsEmpty() const { @@ -255,7 +258,10 @@ namespace value _data); } - bool Value::IsConstant() const { return !std::holds_alternative(_data); } + bool Value::IsConstant() const + { + return GetContext().IsConstantData(*this); + } bool Value::IsIntegral() const 
{ @@ -286,54 +292,128 @@ namespace value } } - bool Value::IsBoolean() const { return _type.first == ValueType::Boolean; } + bool Value::IsBoolean() const + { + return _type.first == ValueType::Boolean; + } - bool Value::IsByte() const { return _type.first == ValueType::Byte; } + bool Value::IsByte() const + { + return _type.first == ValueType::Byte; + } - bool Value::IsInt8() const { return _type.first == ValueType::Int8; } + bool Value::IsInt8() const + { + return _type.first == ValueType::Int8; + } - bool Value::IsInt16() const { return _type.first == ValueType::Int16; } + bool Value::IsInt16() const + { + return _type.first == ValueType::Int16; + } - bool Value::IsInt32() const { return _type.first == ValueType::Int32; } + bool Value::IsInt32() const + { + return _type.first == ValueType::Int32; + } - bool Value::IsInt64() const { return _type.first == ValueType::Int64; } + bool Value::IsInt64() const + { + return _type.first == ValueType::Int64; + } - bool Value::IsUint16() const { return _type.first == ValueType::Uint16; } + bool Value::IsUint16() const + { + return _type.first == ValueType::Uint16; + } - bool Value::IsUint32() const { return _type.first == ValueType::Uint32; } + bool Value::IsUint32() const + { + return _type.first == ValueType::Uint32; + } - bool Value::IsUint64() const { return _type.first == ValueType::Uint64; } + bool Value::IsUint64() const + { + return _type.first == ValueType::Uint64; + } - bool Value::IsIndex() const { return _type.first == ValueType::Index; } + bool Value::IsIndex() const + { + return _type.first == ValueType::Index; + } bool Value::IsFloatingPoint() const { return (_type.first == ValueType::Float16 || _type.first == ValueType::Float || _type.first == ValueType::Double || _type.first == ValueType::BFloat16); } - bool Value::IsFloat16() const { return _type.first == ValueType::Float16; } + bool Value::IsFloat16() const + { + return _type.first == ValueType::Float16; + } - bool Value::IsFloat32() const { return 
_type.first == ValueType::Float; } + bool Value::IsFloat32() const + { + return _type.first == ValueType::Float; + } - bool Value::IsDouble() const { return _type.first == ValueType::Double; } + bool Value::IsDouble() const + { + return _type.first == ValueType::Double; + } - bool Value::IsConstrained() const { return _layout.has_value(); } + bool Value::IsConstrained() const + { + return _layout.has_value(); + } - const MemoryLayout& Value::GetLayout() const { return _layout.value(); } + const MemoryLayout& Value::GetLayout() const + { + return _layout.value(); + } - ValueType Value::GetBaseType() const { return _type.first; } + ValueType Value::GetBaseType() const + { + return _type.first; + } - void Value::SetLayout(MemoryLayout layout) { GetContext().SetLayout(*this, layout); } + void Value::SetLayout(MemoryLayout layout) + { + GetContext().SetLayout(*this, layout); + } - void Value::ClearLayout() { _layout.reset(); } + void Value::ClearLayout() + { + _layout.reset(); + } - void Value::ClearData() { _data = Emittable{ nullptr }; } + void Value::ClearData() + { + _data = Emittable{ nullptr }; + } - int Value::PointerLevel() const { return _type.second; } + void Value::Clear() + { + ClearData(); + ClearLayout(); + _type = {}; + _hasName = false; + } - Value::UnderlyingDataType& Value::GetUnderlyingData() { return _data; } + int Value::PointerLevel() const + { + return _type.second; + } - const Value::UnderlyingDataType& Value::GetUnderlyingData() const { return _data; } + Value::UnderlyingDataType& Value::GetUnderlyingData() + { + return _data; + } + + const Value::UnderlyingDataType& Value::GetUnderlyingData() const + { + return _data; + } void Value::SetName(const std::string& name) { @@ -346,7 +426,10 @@ namespace value return GetContext().GetName(*this); } - bool Value::HasCustomName() const { return _hasName; } + bool Value::HasCustomName() const + { + return _hasName; + } namespace detail { @@ -417,6 +500,46 @@ namespace value return ValueType::Undefined; } 
+ + namespace + { + template + constexpr bool ItemIsOneOf(T&& t, C&& c) + { + return llvm::any_of(c, [=](auto arg) { return t == arg; }); + } + + } // namespace + + bool IsImplicitlyCastable(ViewAdapter v1, ViewAdapter v2) + { + auto source = v1.GetValue().GetBaseType(); + auto target = v2.GetValue().GetBaseType(); + +#define MAP_TARGET_TO_POSSIBLE_SOURCES(TARGET, ...) \ + case TARGET: \ + return ItemIsOneOf(source, std::initializer_list{ __VA_ARGS__ }) + + switch (target) + { + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int8, ValueType::Boolean, ValueType::Byte); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Byte, ValueType::Boolean, ValueType::Int8); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Uint16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int32, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Uint32); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint32, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int64, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Uint64); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint64, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Float16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::BFloat16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Float, ValueType::Boolean, 
ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, ValueType::Uint64, ValueType::Float16, ValueType::BFloat16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Double, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, ValueType::Uint64, ValueType::Float16, ValueType::BFloat16, ValueType::Float); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Index, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, ValueType::Uint64); + + default: + return false; + } + } } // namespace value } // namespace accera diff --git a/accera/value/src/ValueOperations.cpp b/accera/value/src/ValueOperations.cpp index e2ef7233..e459ed03 100644 --- a/accera/value/src/ValueOperations.cpp +++ b/accera/value/src/ValueOperations.cpp @@ -70,15 +70,5 @@ namespace value return GetContext().Cast(value, type); } - Scalar UnsignedCast(Scalar value, ValueType type) - { - if (value.GetType() == type) - { - return value; - } - - return GetContext().UnsignedCast(value, type); - } - } // namespace value } // namespace accera diff --git a/docs/.bumpversion.cfg b/docs/.bumpversion.cfg index 25c610e8..8f395e6c 100644 --- a/docs/.bumpversion.cfg +++ b/docs/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.2.6 +current_version = 1.2.7 [bumpversion:glob:**/*.md] search = Version: v{current_version} diff --git a/docs/Case Studies/CONTRIBUTING.md b/docs/Case Studies/CONTRIBUTING.md index 1b4d4a12..134cf155 100644 --- a/docs/Case Studies/CONTRIBUTING.md +++ b/docs/Case Studies/CONTRIBUTING.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Contributing Guide diff --git a/docs/Case Studies/README.md b/docs/Case Studies/README.md index aed5655d..51318707 100644 --- a/docs/Case Studies/README.md +++ 
b/docs/Case Studies/README.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Accera Case Studies diff --git a/docs/Install/Building_on_MacOS.md b/docs/Install/Building_on_MacOS.md index 5d9cfd2e..a3eb90fc 100644 --- a/docs/Install/Building_on_MacOS.md +++ b/docs/Install/Building_on_MacOS.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on MacOS diff --git a/docs/Install/Building_on_Ubuntu.md b/docs/Install/Building_on_Ubuntu.md index 9a69f660..80ca2a4e 100644 --- a/docs/Install/Building_on_Ubuntu.md +++ b/docs/Install/Building_on_Ubuntu.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on Ubuntu diff --git a/docs/Install/Building_on_Windows.md b/docs/Install/Building_on_Windows.md index 27bd5420..e37622f3 100644 --- a/docs/Install/Building_on_Windows.md +++ b/docs/Install/Building_on_Windows.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on Windows diff --git a/docs/Install/Installing_Accera_on_MacOS.md b/docs/Install/Installing_Accera_on_MacOS.md index 21d229d4..51fef805 100644 --- a/docs/Install/Installing_Accera_on_MacOS.md +++ b/docs/Install/Installing_Accera_on_MacOS.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on MacOS diff --git a/docs/Install/Installing_Accera_on_Ubuntu.md b/docs/Install/Installing_Accera_on_Ubuntu.md index 4c954e39..0355e46f 100644 --- a/docs/Install/Installing_Accera_on_Ubuntu.md +++ b/docs/Install/Installing_Accera_on_Ubuntu.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on Ubuntu diff --git a/docs/Install/Installing_Accera_on_Windows.md b/docs/Install/Installing_Accera_on_Windows.md index 68fe7885..174d520b 100644 --- a/docs/Install/Installing_Accera_on_Windows.md 
+++ b/docs/Install/Installing_Accera_on_Windows.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on Windows diff --git a/docs/Install/README.md b/docs/Install/README.md index 93f3f566..b29d3864 100644 --- a/docs/Install/README.md +++ b/docs/Install/README.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Install from PyPI The quickest way to get up and running is to install the pre-built Python packages: diff --git a/docs/Manual/00 Introduction.md b/docs/Manual/00 Introduction.md index 7a29671f..3a4272ce 100644 --- a/docs/Manual/00 Introduction.md +++ b/docs/Manual/00 Introduction.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Introduction Accera is a framework with a Python-based Domain-specific Language (eDSL) that produces optimized compute-intensive code. Accera's primary focus is the optimization of affine and semi-affine nested for-loops for CPU and GPU targets. diff --git a/docs/Manual/01 Arrays.md b/docs/Manual/01 Arrays.md index 64c441e0..383467c8 100644 --- a/docs/Manual/01 Arrays.md +++ b/docs/Manual/01 Arrays.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 1: Arrays Accera stores data in multi-dimensional arrays of scalar elements where all the array elements share the same primary data type (e.g., float32, int8). An array has a constant number of dimensions *d* known at compile-time (e.g., a matrix is a 2-dimensional array). Each dimension has a positive size, and the sequence of *d* sizes is called the *shape* of the array. An element of an array is referred to by a *d*-coordinate zero-based *index vector*. 
diff --git a/docs/Manual/02 Simple Affine Loop Nests.md b/docs/Manual/02 Simple Affine Loop Nests.md index 3a9b4a07..6dbf8fc3 100644 --- a/docs/Manual/02 Simple Affine Loop Nests.md +++ b/docs/Manual/02 Simple Affine Loop Nests.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 2: Simple affine loop nests This section introduces *loop nests* and their different types that are provided in Accera programming model. @@ -79,7 +79,7 @@ nest.iteration_logic(logic_fn) ``` ## Supported operations -The iteration logic can include the following operations (assuming `accera` was imported as `rp`): +The iteration logic can include the following operations (assuming `accera` was imported as `acc`): ### Assignment operators @@ -156,6 +156,35 @@ Comment: Accera also supports the corresponding compound-assignment operators, s | `acc.cosh(a)` | `acc.ScalarType.float16/32/64` | Returns the hyperbolic cosine of scalar *a*, where *a* is in radians | | `acc.tanh(a)` | `acc.ScalarType.float16/32/64` | Returns the hyperbolic tangent of scalar *a*, where *a* is in radians | +### Implicit type casting + +Accera operators require operands to be the same type. Computations that use multiple types can take advantage of Accera's implicit type casting support when converting from smaller-sized types to larger-sized types. + +To do implicit casting, simply assign a source type to its implicitly-castable destination type. No additional casting operation is needed for converting between these types. 
+ +| Source types | Destination type (implicitly-castable) | +| ------------ | -------------------------------------- | +| `acc.ScalarType.bool`, `acc.ScalarType.uint8` | `acc.ScalarType.int8` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8` | `acc.ScalarType.uint8` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.uint16` | `acc.ScalarType.int16` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16` | `acc.ScalarType.uint16` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.uint32` | `acc.ScalarType.int32` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.int32` | `acc.ScalarType.uint32` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.int32`, `acc.ScalarType.uint32`, `acc.ScalarType.uint64` | `acc.ScalarType.int64` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.int32`, `acc.ScalarType.uint32`, `acc.ScalarType.int64` | `acc.ScalarType.uint64` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16` | `acc.ScalarType.float16` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16` | `acc.ScalarType.bfloat16` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.int32`, `acc.ScalarType.uint32`, `acc.ScalarType.int64`, `acc.ScalarType.float16`, `acc.ScalarType.bfloat16` | `acc.ScalarType.float32` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, 
`acc.ScalarType.uint16`, `acc.ScalarType.int32`, `acc.ScalarType.uint32`, `acc.ScalarType.int64`, `acc.ScalarType.float16`, `acc.ScalarType.bfloat16`, `acc.ScalarType.float32` | `acc.ScalarType.float64` | + +[comment]: # (bool, int8, uint8, int16, uint16, int32, uint32, int64, uint64 | index) + +To override the casting behavior above, or cast a larger-sized type to a smaller-sized type, use the `acc.cast` operation. + +Comment: implicit casting of constants may result in truncation. + +[comment]: # (MISSING: examples for constant implicit casting that cause unexpected truncation) + ## Accera program stages Let’s take a step back to describe the stages of Accera program: diff --git a/docs/Manual/03 Schedules.md b/docs/Manual/03 Schedules.md index 58d8ba95..876ccaa4 100644 --- a/docs/Manual/03 Schedules.md +++ b/docs/Manual/03 Schedules.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 3: Schedules We begin with `nest` from [Section 2](<02%20Simple%20Affine%20Loop%20Nests.md>) which captures the logic of matrix-matrix multiplication. We use `nest` to create a `Schedule` that controls the execution order of the nest's iterations. Schedules are target-independent in the sense that the same schedule can be used to emit code for multiple target platforms. diff --git a/docs/Manual/04 Fusing.md b/docs/Manual/04 Fusing.md index f363c024..c0795091 100644 --- a/docs/Manual/04 Fusing.md +++ b/docs/Manual/04 Fusing.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 4: Fusing With `fuse` operation, multiple schedules can be combined into a single schedule representing the union of the work in the original schedules. These fused schedules can be transformed by any of the transformations presented in [Section 3](<03%20Schedules.md>). 
diff --git a/docs/Manual/05 Targets.md b/docs/Manual/05 Targets.md index 14cacaa2..7d305691 100644 --- a/docs/Manual/05 Targets.md +++ b/docs/Manual/05 Targets.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 5: Targets Accera is a cross compiler, which means that it can generate executable code for different target platforms. A target is described using the `Target` class. Accera already supports many different targets, for example: diff --git a/docs/Manual/06 Plans - Caching.md b/docs/Manual/06 Plans - Caching.md index 4a589825..83a91d6d 100644 --- a/docs/Manual/06 Plans - Caching.md +++ b/docs/Manual/06 Plans - Caching.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 6: Plans - Caching In the previous sections, we defined the logic and then scheduled its iterations. Now, let's move on to completing the implementation with target-specific options. diff --git a/docs/Manual/07 Plans - Operations and Optimizations.md b/docs/Manual/07 Plans - Operations and Optimizations.md index 36e91305..f3c535d1 100644 --- a/docs/Manual/07 Plans - Operations and Optimizations.md +++ b/docs/Manual/07 Plans - Operations and Optimizations.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 7: Plans - Operations and Optimizations We can control target-specific operations and optimizations using a plan. Examples include instruction pipelining, applying SIMD vector instructions, and so on. 
diff --git a/docs/Manual/08 Deferred Layout of Constant Arrays.md b/docs/Manual/08 Deferred Layout of Constant Arrays.md index 5a8109bd..b8a0d7c9 100644 --- a/docs/Manual/08 Deferred Layout of Constant Arrays.md +++ b/docs/Manual/08 Deferred Layout of Constant Arrays.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 8: Deferred layout of constant arrays Let's revisit the memory layout of constant arrays. As explained in [Section 1](<01%20Arrays.md>), the contents of constant arrays are known at compile-time, and these contents are immutable. Accera stores constant arrays in a non-standard memory layout optimized for a particular plan. In some cases, storing multiple copies of each array element may even prove advantageous (e.g., storing a matrix in row-major and column-major layouts). diff --git a/docs/Manual/09 Parameters.md b/docs/Manual/09 Parameters.md index b6b15696..fbc8f8f5 100644 --- a/docs/Manual/09 Parameters.md +++ b/docs/Manual/09 Parameters.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 9: Parameters diff --git a/docs/Manual/10 Packages.md b/docs/Manual/10 Packages.md index b2e3c4ad..67b9a2d2 100644 --- a/docs/Manual/10 Packages.md +++ b/docs/Manual/10 Packages.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 10: Building Packages The `Package` class represents a collection of Accera-generated functions. Whenever a package is built, it creates a stand-alone function library that other pieces of software can use. Currently, Accera supports two package formats: HAT and MLIR. 
diff --git a/docs/Manual/README.md b/docs/Manual/README.md index cd1c9d32..62513753 100644 --- a/docs/Manual/README.md +++ b/docs/Manual/README.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Accera v1.2.1 Manual diff --git a/docs/Reference/accera.md b/docs/Reference/accera.md index 3bf23e6d..45eb98fc 100644 --- a/docs/Reference/accera.md +++ b/docs/Reference/accera.md @@ -1,9 +1,10 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference # Module functions +* [`accera.cast`](functions/cast.md) `(value, type)` * [`accera.create_parameters`](functions/create_parameters.md) `(number)` * [`accera.create_parameter_grid`](functions/create_parameter_grid.md) `(parameter_choices, filter_func, sample)` * [`accera.fuse`](functions/fuse.md) `(schedules[, partial])` diff --git a/docs/Reference/classes/Array/Array.md b/docs/Reference/classes/Array/Array.md index 8766421f..8ab083ff 100644 --- a/docs/Reference/classes/Array/Array.md +++ b/docs/Reference/classes/Array/Array.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array(role[, data, element_type, layout, offset, shape])` Constructs an array. 
diff --git a/docs/Reference/classes/Array/Layout.md b/docs/Reference/classes/Array/Layout.md index c64c3395..c81070dc 100644 --- a/docs/Reference/classes/Array/Layout.md +++ b/docs/Reference/classes/Array/Layout.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array.Layout` type | description diff --git a/docs/Reference/classes/Array/Role.md b/docs/Reference/classes/Array/Role.md index a2bea01a..0dc36f35 100644 --- a/docs/Reference/classes/Array/Role.md +++ b/docs/Reference/classes/Array/Role.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array.Role` type | description diff --git a/docs/Reference/classes/Array/deferred_layout.md b/docs/Reference/classes/Array/deferred_layout.md index 4cf5452a..334e22d5 100644 --- a/docs/Reference/classes/Array/deferred_layout.md +++ b/docs/Reference/classes/Array/deferred_layout.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array.deferred_layout(cache)` Specifies the layout for a `Array.Role.CONST` array based on a `Cache`. For more details, see [Deferred layout of constant arrays](<../../../Manual/08%20Deferred%20Layout%20of%20Constant%20Arrays.md>) diff --git a/docs/Reference/classes/Array/sub_array.md b/docs/Reference/classes/Array/sub_array.md index 177b2b3c..ffa9fde3 100644 --- a/docs/Reference/classes/Array/sub_array.md +++ b/docs/Reference/classes/Array/sub_array.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array.sub_array(offsets, shape, strides)` Creates a sub-array of a specific shape from an array. 
The sub-array is created from elements at specified offsets and strides into the original array. diff --git a/docs/Reference/classes/Nest/Nest.md b/docs/Reference/classes/Nest/Nest.md index 464c2935..cdf885fe 100644 --- a/docs/Reference/classes/Nest/Nest.md +++ b/docs/Reference/classes/Nest/Nest.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest(shape)` Creates an affine loop nest. diff --git a/docs/Reference/classes/Nest/create_plan.md b/docs/Reference/classes/Nest/create_plan.md index b7299ee4..daf3f898 100644 --- a/docs/Reference/classes/Nest/create_plan.md +++ b/docs/Reference/classes/Nest/create_plan.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest.create_plan([target])` Creates a plan using the default schedule for the nest. diff --git a/docs/Reference/classes/Nest/create_schedule.md b/docs/Reference/classes/Nest/create_schedule.md index 46b5c81b..1ee42a5e 100644 --- a/docs/Reference/classes/Nest/create_schedule.md +++ b/docs/Reference/classes/Nest/create_schedule.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest.create_schedule()` Create a default schedule for a nest. diff --git a/docs/Reference/classes/Nest/get_indices.md b/docs/Reference/classes/Nest/get_indices.md index f4188cc1..e7455bb5 100644 --- a/docs/Reference/classes/Nest/get_indices.md +++ b/docs/Reference/classes/Nest/get_indices.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest.get_indices()` Gets the iteration space dimensions for a nest. 
diff --git a/docs/Reference/classes/Nest/iteration_logic.md b/docs/Reference/classes/Nest/iteration_logic.md index 33066716..07a523ac 100644 --- a/docs/Reference/classes/Nest/iteration_logic.md +++ b/docs/Reference/classes/Nest/iteration_logic.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest.iteration_logic(logic)` Adds an iteration logic function to a `Nest`. diff --git a/docs/Reference/classes/Package/Format.md b/docs/Reference/classes/Package/Format.md index 0a8f7f60..6666aa95 100644 --- a/docs/Reference/classes/Package/Format.md +++ b/docs/Reference/classes/Package/Format.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.Format` type | description diff --git a/docs/Reference/classes/Package/Mode.md b/docs/Reference/classes/Package/Mode.md index 44dbae43..3ed99bc2 100644 --- a/docs/Reference/classes/Package/Mode.md +++ b/docs/Reference/classes/Package/Mode.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.Mode` type | description diff --git a/docs/Reference/classes/Package/Package.md b/docs/Reference/classes/Package/Package.md index faa4b3c3..c4c42939 100644 --- a/docs/Reference/classes/Package/Package.md +++ b/docs/Reference/classes/Package/Package.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.Package()` A package of functions that can be built and linked with client code. 
diff --git a/docs/Reference/classes/Package/Platform.md b/docs/Reference/classes/Package/Platform.md index d85fa9cd..970c209d 100644 --- a/docs/Reference/classes/Package/Platform.md +++ b/docs/Reference/classes/Package/Platform.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.Platform` type | description diff --git a/docs/Reference/classes/Package/add.md b/docs/Reference/classes/Package/add.md index b0761257..2c929715 100644 --- a/docs/Reference/classes/Package/add.md +++ b/docs/Reference/classes/Package/add.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.add(source, args[, base_name, parameters])` Adds one or more functions to the package. diff --git a/docs/Reference/classes/Package/add_description.md b/docs/Reference/classes/Package/add_description.md index 1d1249d1..41e25edc 100644 --- a/docs/Reference/classes/Package/add_description.md +++ b/docs/Reference/classes/Package/add_description.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.add_description([author, license, other, version])` Adds descriptive metadata to the HAT package. diff --git a/docs/Reference/classes/Package/build.md b/docs/Reference/classes/Package/build.md index acf294e2..707188e9 100644 --- a/docs/Reference/classes/Package/build.md +++ b/docs/Reference/classes/Package/build.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.build(name[, format, mode, platform, tolerance, output_dir])` Builds a HAT package. 
diff --git a/docs/Reference/classes/Plan/bind.md b/docs/Reference/classes/Plan/bind.md index 4c2a49ef..1e3de9ad 100644 --- a/docs/Reference/classes/Plan/bind.md +++ b/docs/Reference/classes/Plan/bind.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.bind(mapping)` Only available for targets that can execute a grid of work (such as GPUs). The `bind` function binds dimensions of the iteration space to axes of the target-specific grid (such as `v100.GridUnit.BLOCK_X`, `v100.GridUnit.THREAD_X` on an Nvidia GPU). diff --git a/docs/Reference/classes/Plan/cache.md b/docs/Reference/classes/Plan/cache.md index 41262435..09c2d368 100644 --- a/docs/Reference/classes/Plan/cache.md +++ b/docs/Reference/classes/Plan/cache.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.cache(source[, index, trigger_index, layout, level, trigger_level, max_elements, thrifty, location, double_buffer])` Adds a caching strategy to a plan. diff --git a/docs/Reference/classes/Plan/kernelize.md b/docs/Reference/classes/Plan/kernelize.md index db6f1c8d..4a860f74 100644 --- a/docs/Reference/classes/Plan/kernelize.md +++ b/docs/Reference/classes/Plan/kernelize.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.kernelize(unroll_indices[, vectorize_indices])` A convenience method for a sequence of `unroll` instructions followed by a possible sequence of `vectorize` instructions. 
diff --git a/docs/Reference/classes/Plan/parallelize.md b/docs/Reference/classes/Plan/parallelize.md index 4f07b468..83fb0e13 100644 --- a/docs/Reference/classes/Plan/parallelize.md +++ b/docs/Reference/classes/Plan/parallelize.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.parallelize(indices[, pin, policy])` diff --git a/docs/Reference/classes/Plan/tensorize.md b/docs/Reference/classes/Plan/tensorize.md index 40752110..7f4389c7 100644 --- a/docs/Reference/classes/Plan/tensorize.md +++ b/docs/Reference/classes/Plan/tensorize.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.tensorize(indices, mma_shape [, use_static_offsets, num_total_passes, num_fused_passes, scheduling_policy])` Only available for targets with native matrix multiplication instruction (tensor core) support. Marks the dimensions of the iteration-space for tensorization. Only perfectly nested loops of the following form can be tensorized: diff --git a/docs/Reference/classes/Plan/unroll.md b/docs/Reference/classes/Plan/unroll.md index 661f02cb..c1b288e7 100644 --- a/docs/Reference/classes/Plan/unroll.md +++ b/docs/Reference/classes/Plan/unroll.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.unroll(index)` Marks a dimension of the iteration-space for unrolling. 
diff --git a/docs/Reference/classes/Plan/vectorize.md b/docs/Reference/classes/Plan/vectorize.md index f390190b..a8ac6eeb 100644 --- a/docs/Reference/classes/Plan/vectorize.md +++ b/docs/Reference/classes/Plan/vectorize.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.vectorize(index)` Only available for targets that have SIMD registers and support vector instructions. Marks a dimension of the iteration-space for vectorization. diff --git a/docs/Reference/classes/Schedule/create_plan.md b/docs/Reference/classes/Schedule/create_plan.md index 85374f9b..75fa8f6b 100644 --- a/docs/Reference/classes/Schedule/create_plan.md +++ b/docs/Reference/classes/Schedule/create_plan.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.create_plan([target])` Creates a plan for running this schedule. diff --git a/docs/Reference/classes/Schedule/is_valid_loop_order.md b/docs/Reference/classes/Schedule/is_valid_loop_order.md index 076d9b6e..a258a407 100644 --- a/docs/Reference/classes/Schedule/is_valid_loop_order.md +++ b/docs/Reference/classes/Schedule/is_valid_loop_order.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.is_valid_loop_order(*order)` The `is_valid_loop_order` function determines if an order of indices is valid. For a description of valid schedule orders, refer to [reorder](reorder.md). 
diff --git a/docs/Reference/classes/Schedule/pad.md b/docs/Reference/classes/Schedule/pad.md index c4859a21..b7b8ce84 100644 --- a/docs/Reference/classes/Schedule/pad.md +++ b/docs/Reference/classes/Schedule/pad.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.pad(index, size)` Pads the beginning of a specified dimension of the iteration-space with empty (no-op) elements. diff --git a/docs/Reference/classes/Schedule/reorder.md b/docs/Reference/classes/Schedule/reorder.md index 43e2bd11..243f8587 100644 --- a/docs/Reference/classes/Schedule/reorder.md +++ b/docs/Reference/classes/Schedule/reorder.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.reorder(order, *args)` The `reorder` transformation sets the order of the indices in the schedule. diff --git a/docs/Reference/classes/Schedule/skew.md b/docs/Reference/classes/Schedule/skew.md index b8f48ddd..3114c610 100644 --- a/docs/Reference/classes/Schedule/skew.md +++ b/docs/Reference/classes/Schedule/skew.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.skew(index, reference_index [, unroll_loops_smaller_than])` Transforms a dimension with respect to a reference dimension into a parallelogram by padding with empty elements. 
diff --git a/docs/Reference/classes/Schedule/split.md b/docs/Reference/classes/Schedule/split.md index 1909b862..7a316052 100644 --- a/docs/Reference/classes/Schedule/split.md +++ b/docs/Reference/classes/Schedule/split.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.split(index, size)` The `split` transformation takes a dimension `i` and a `size`, modifies `i`, and creates a new dimension `ii`. diff --git a/docs/Reference/classes/Schedule/tile.md b/docs/Reference/classes/Schedule/tile.md index ff0d18e1..8514cf1d 100644 --- a/docs/Reference/classes/Schedule/tile.md +++ b/docs/Reference/classes/Schedule/tile.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.tile(shape)` The `tile` transformation is a convenience syntax that takes a tuple of indices and a tuple of sizes, and splits each index by the corresponding size. The indices involved in the split are then ordered such that all the outer indices precede all of their respective inner indices. diff --git a/docs/Reference/classes/Target/Architecture.md b/docs/Reference/classes/Target/Architecture.md index 62611088..0ae50bcf 100644 --- a/docs/Reference/classes/Target/Architecture.md +++ b/docs/Reference/classes/Target/Architecture.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target.Architecture` Defines the supported target architectures. 
diff --git a/docs/Reference/classes/Target/Category.md b/docs/Reference/classes/Target/Category.md index 73e995ef..4de78aa9 100644 --- a/docs/Reference/classes/Target/Category.md +++ b/docs/Reference/classes/Target/Category.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target.Category` Defines the target processor category. diff --git a/docs/Reference/classes/Target/Model.md b/docs/Reference/classes/Target/Model.md index ea26c6d0..98940157 100644 --- a/docs/Reference/classes/Target/Model.md +++ b/docs/Reference/classes/Target/Model.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target.Model` Defines constants for some well-known CPU models. diff --git a/docs/Reference/classes/Target/Runtime.md b/docs/Reference/classes/Target/Runtime.md index 94726176..7e004789 100644 --- a/docs/Reference/classes/Target/Runtime.md +++ b/docs/Reference/classes/Target/Runtime.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target.Runtime` The runtime for code generation and/or compilation. 
diff --git a/docs/Reference/classes/Target/Target.md b/docs/Reference/classes/Target/Target.md index 7ae616a3..53f18231 100644 --- a/docs/Reference/classes/Target/Target.md +++ b/docs/Reference/classes/Target/Target.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target([architecture, cache_lines, cache_sizes, category, extensions, family, frequency_GHz, known_name, model, name, num_cores, num_threads, runtime, tensor_core_info, turbo_frequency_GHz, vector_bytes, vector_registers)` diff --git a/docs/Reference/enumerations/MMASchedulingPolicy.md b/docs/Reference/enumerations/MMASchedulingPolicy.md index 6f42ba32..81c1bc2f 100644 --- a/docs/Reference/enumerations/MMASchedulingPolicy.md +++ b/docs/Reference/enumerations/MMASchedulingPolicy.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.MMASchedulingPolicy` type | description diff --git a/docs/Reference/enumerations/MMAShape.md b/docs/Reference/enumerations/MMAShape.md index e792175f..8468d2a0 100644 --- a/docs/Reference/enumerations/MMAShape.md +++ b/docs/Reference/enumerations/MMAShape.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.MMAShape` The following table shows the matrix multiplication parameters associated with the different enum values, for different data types for a single pass. So for example a single pass of the `M32xN32xK2_B1` operation would take input matrices of dimensions [32x2] (A) and [2x32] (B) to produce a matrix multiplication result of dimensions [32x32] (C). These operations can then be composed together to perform matrix multiplication of larger matrices. 
diff --git a/docs/Reference/enumerations/ScalarType.md b/docs/Reference/enumerations/ScalarType.md index 881b0d77..f7dccd0a 100644 --- a/docs/Reference/enumerations/ScalarType.md +++ b/docs/Reference/enumerations/ScalarType.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.ScalarType` type | description diff --git a/docs/Reference/functions/cast.md b/docs/Reference/functions/cast.md new file mode 100644 index 00000000..a51c2a43 --- /dev/null +++ b/docs/Reference/functions/cast.md @@ -0,0 +1,77 @@ +[//]: # (Project: Accera) +[//]: # (Version: v1.2.7) + +# Accera v1.2.7 Reference + +## `accera.cast(value, type)` +The `cast` operation converts a value from one `acc.ScalarType` to another. + +Accera performs implicit casting between most types. Therefore, this operation should only be used to override the implicit casting behavior documented in [Section 2](<../../Manual/02%20Simple%20Affine%20Loop%20Nests.md>). + +Limitation: casting constants may result in truncation. + +[comment]: # (MISSING: examples for constant casting that cause unexpected truncation) + + +## Arguments + +argument | description | type/default +--- | --- | --- +`value` | The value to cast | +`type` | The destination type | `acc.ScalarType` + +## Returns
The result after casting + +## Examples + +Casting from float32 to int16: + +```python +A = acc.Array(role=acc.Array.Role.INPUT, element_type=acc.ScalarType.float32, shape=(10, 20)) +B = acc.Array(role=acc.Array.Role.INPUT_OUTPUT, element_type=acc.ScalarType.int16, shape=(10, 20)) + +nest = acc.Nest(shape=(10, 20)) +i, j = nest.get_indices() + +@nest.iteration_logic +def _(): + B[i, j] = acc.cast(A[i, j], acc.ScalarType.int16) # explicit cast to int16 + ... 
+``` + +In comparison, casting from int16 to float32 is implicit, which means the `cast` operation can be omitted: + +```python +A = acc.Array(role=acc.Array.Role.INPUT, element_type=acc.ScalarType.int16, shape=(10, 20)) +B = acc.Array(role=acc.Array.Role.INPUT_OUTPUT, element_type=acc.ScalarType.float32, shape=(10, 20)) + +nest = acc.Nest(shape=(10, 20)) +i, j = nest.get_indices() + +@nest.iteration_logic +def _(): + B[i, j] = A[i, j] # implicit cast to float32 + ... +``` + +Casting a constant to int8: + +```python + +A = acc.Array(role=acc.Array.Role.INPUT_OUTPUT, element_type=acc.ScalarType.int8, shape=(10, 20)) + +nest = acc.Nest(shape=(10, 20)) +i, j = nest.get_indices() + +@nest.iteration_logic +def _(): + A[i, j] = acc.cast(10, acc.ScalarType.int8) + ... + +``` + + +
+ + diff --git a/docs/Reference/functions/create_parameter_grid.md b/docs/Reference/functions/create_parameter_grid.md index d3d0dedd..9c61b1ad 100644 --- a/docs/Reference/functions/create_parameter_grid.md +++ b/docs/Reference/functions/create_parameter_grid.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.create_parameter_grid(parameter_choices, filter_func, sample, seed)` Create a parameter grid from a dictionary that maps each parameter to its possible values. diff --git a/docs/Reference/functions/create_parameters.md b/docs/Reference/functions/create_parameters.md index 9fd46de7..8ffc9e82 100644 --- a/docs/Reference/functions/create_parameters.md +++ b/docs/Reference/functions/create_parameters.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.create_parameters()` Creates placeholder parameters. diff --git a/docs/Reference/functions/fuse.md b/docs/Reference/functions/fuse.md index ca7880fd..090d3bf4 100644 --- a/docs/Reference/functions/fuse.md +++ b/docs/Reference/functions/fuse.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.fuse(schedules[, *args, partial])` The `fuse` operation combines multiple iteration spaces into a single "fused" iteration space. The fused iteration space represents the union of the work in the original spaces. 
diff --git a/docs/Reference/safety_analysis.md b/docs/Reference/safety_analysis.md index ed2c91de..1ac43468 100644 --- a/docs/Reference/safety_analysis.md +++ b/docs/Reference/safety_analysis.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference # Safety Analysis diff --git a/docs/Tutorials/Hello_MatMul.md b/docs/Tutorials/Hello_MatMul.md index 20645bbf..67d99a94 100644 --- a/docs/Tutorials/Hello_MatMul.md +++ b/docs/Tutorials/Hello_MatMul.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Hello MatMul diff --git a/docs/Tutorials/Hello_MatMul_GPU.md b/docs/Tutorials/Hello_MatMul_GPU.md index a0e47136..675db836 100644 --- a/docs/Tutorials/Hello_MatMul_GPU.md +++ b/docs/Tutorials/Hello_MatMul_GPU.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Hello MatMul GPU diff --git a/docs/Tutorials/Optimized_MatMul.md b/docs/Tutorials/Optimized_MatMul.md index c29c3b88..301c01b1 100644 --- a/docs/Tutorials/Optimized_MatMul.md +++ b/docs/Tutorials/Optimized_MatMul.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Optimized MatMul diff --git a/docs/Tutorials/Pi3_Cross_Compilation.md b/docs/Tutorials/Pi3_Cross_Compilation.md index 50274887..b53f60e8 100644 --- a/docs/Tutorials/Pi3_Cross_Compilation.md +++ b/docs/Tutorials/Pi3_Cross_Compilation.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Cross Compiling for the Raspberry Pi 3 diff --git a/docs/Tutorials/README.md b/docs/Tutorials/README.md index b21939e2..817bec04 100644 --- a/docs/Tutorials/README.md +++ b/docs/Tutorials/README.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Accera Tutorials diff --git a/tools/benchmarkers/gpu_benchmark_tool.py 
b/tools/benchmarkers/gpu_benchmark_tool.py index 44cb910e..bb1690d0 100755 --- a/tools/benchmarkers/gpu_benchmark_tool.py +++ b/tools/benchmarkers/gpu_benchmark_tool.py @@ -210,7 +210,7 @@ def prepare_system_for_benchmark(target, available_gpus): def main(args=[]): parser = argparse.ArgumentParser() parser.add_argument('-d', '--devices', help='The devices to use for the benchmark', required=False, default="0,1,2,3") - parser.add_argument('-i', '--input', help='The input config file (csv)', required=False) + parser.add_argument('-i', '--input', help='Comma-separated list of input config files (csv)', required=False) parser.add_argument('-y', '--type', help='The data type for the input set, h or fp16, s for fp32', required=False) parser.add_argument('-b', '--branch', help='The git branch to use to tag the results to', required=False) parser.add_argument('-z', '--string', help='input config string (csv, semi-colon per row)', required=False) @@ -241,15 +241,18 @@ def main(args=[]): args.string = ','.join(gemm_opts.CONFIG_HEADERS) + '\n' + '\n'.join(args.string.split(';')) f = StringIO(args.string) - try: - if f is None: - f = open(args.input) + gemm_inputs = [] + if f is None: + input_files = args.input.split(",") + for file in input_files: + try: + f = open(file) - reader = csv.DictReader(f, gemm_opts.CONFIG_HEADERS) - gemm_inputs = [gemm_opts.GemmOpts(**data) for data in islice(reader, 1, None)] + reader = csv.DictReader(f, gemm_opts.CONFIG_HEADERS) + gemm_inputs += [gemm_opts.GemmOpts(**data) for data in islice(reader, 1, None)] - finally: - f.close() + finally: + f.close() available_gpus = [] devices = args.devices.split(",")