From 1d3b1a396ea2e0a7ab3fc314c4ceb38de52284df Mon Sep 17 00:00:00 2001 From: Kern Handa Date: Tue, 12 Jul 2022 10:23:04 -0700 Subject: [PATCH] Squashed commit of the following: commit f3a1a2becb6740ae8cf7873b5029c6df140f5c19 Author: Kern Handa Date: Tue Jul 12 16:52:41 2022 +0000 Merged PR 2744: [doc] Fixes link in reference/functions/cast.md, revs version on all docs [doc] Fixes link in reference/functions/cast.md commit 23f4c8fbf2415b02e8b0090a76380d34790205fa Author: Lisa Ong Date: Tue Jul 12 05:55:48 2022 +0000 Merged PR 2743: [DSL] Document implicit casting rules and the explicit `cast` function * Document implicit casting rules implemented by !2693 * Promote `acc.cast` to a documented function to give the user control to override implicit casting behavior commit 3ec63b62705327a65decc4da7ec4cb5412dc7299 Author: Kern Handa Date: Mon Jul 11 23:57:23 2022 +0000 Merged PR 2739: Updates ROCM tensorization pattern to handle casting Updates ROCM tensorization pattern to handle casting commit 60c082dd38ff1b0bc030a7e28dc19f553bad9099 Author: Mason Remy Date: Mon Jul 11 22:58:42 2022 +0000 Merged PR 2643: Some fixes for last major array caching in tensorization Some fixes for last major array caching in tensorization commit 812c3065b7d4d6c9d716acf4fb1df4be66ef101d Author: Kern Handa Date: Mon Jul 11 20:43:12 2022 +0000 Merged PR 2693: Updates DSL codegen to implicitly cast if possible Updates DSL codegen to implicitly cast if possible commit 6ed316e50e8f9e398f9ee6b8bfa8e6aa05fbffb1 Author: Ritwik Das Date: Sat Jul 9 05:52:22 2022 +0000 Merged PR 2735: Pass multiple input files as comma-separated list to benchmark tool https://intelligentdevices.visualstudio.com/ELL/_build/results?buildId=41588&view=logs&j=d78921a4-2f18-50b0-77ad-4c6803f3371b&t=f97c60f6-ada7-5ec9-5ea1-510216c408e9 Above pipeline did not run the 2nd set of input sizes since the 1st process did not exit until pipeline timeout was hit. After the fix, we will always have a single job. 
--- .azure/cuda/cuda-benchmark-baseline.yml | 18 +- .azure/cuda/cuda-benchmark-fp16-bert.yml | 3 - .azure/cuda/cuda-benchmark-fp16-big.yml | 9 +- .azure/cuda/cuda-benchmark-fp16.yml | 9 +- .azure/cuda/cuda-benchmark-fp32-bert.yml | 3 - .azure/cuda/cuda-benchmark-fp32-big.yml | 11 +- .azure/cuda/cuda-benchmark-fp32-resnet.yml | 3 - .azure/cuda/cuda-benchmark-fp32.yml | 10 +- .azure/rocm/rocm-benchmark-baseline.yml | 18 +- .azure/rocm/rocm-benchmark-fp16-bert.yml | 3 - .azure/rocm/rocm-benchmark-fp16-big.yml | 11 +- .azure/rocm/rocm-benchmark-fp16.yml | 11 +- .azure/rocm/rocm-benchmark-fp32-bert.yml | 3 - .azure/rocm/rocm-benchmark-fp32-big.yml | 11 +- .azure/rocm/rocm-benchmark-fp32-resnet.yml | 3 - .azure/rocm/rocm-benchmark-fp32.yml | 13 +- accera/acc-opt/test/thrifty_caching.mlir | 4 +- accera/acc-opt/test/value_mlir_test.cpp | 4 +- accera/ir/include/exec/ExecutionPlanOps.td | 4 +- accera/ir/include/value/ValueOps.td | 28 +- accera/ir/src/exec/ExecutionPlanOps.cpp | 3 + accera/python/accera/__init__.py | 2 +- accera/python/accera/test/dsl_tests.py | 29 +- accera/python/accera/test/mfma_tests.py | 214 ++++++++++++++- accera/python/accera/test/smoke_tests.py | 20 +- accera/python/accera/test/unit_tests.py | 22 +- accera/python/lib/src/ContainerTypes.cpp | 17 +- accera/python/lib/src/SchedulingTypes.cpp | 2 +- .../ExecutionPlanToAffineLoweringPass.cpp | 59 +++- accera/transforms/src/gpu/AcceraToGPUPass.cpp | 46 +++- accera/utilities/include/TypeTraits.h | 1 + accera/value/include/EmitterContext.h | 8 +- accera/value/include/MLIREmitterContext.h | 4 +- accera/value/include/Scalar.h | 5 +- accera/value/include/Value.h | 13 +- accera/value/include/ValueType.h | 55 ++++ accera/value/src/EmitterContext.cpp | 14 +- accera/value/src/MLIREmitterContext.cpp | 70 ++--- accera/value/src/Scalar.cpp | 255 ++++++++++++++---- accera/value/src/ScalarOperations.cpp | 25 +- accera/value/src/Value.cpp | 187 ++++++++++--- accera/value/src/ValueOperations.cpp | 10 - 
docs/.bumpversion.cfg | 2 +- docs/Case Studies/CONTRIBUTING.md | 2 +- docs/Case Studies/README.md | 2 +- docs/Install/Building_on_MacOS.md | 2 +- docs/Install/Building_on_Ubuntu.md | 2 +- docs/Install/Building_on_Windows.md | 2 +- docs/Install/Installing_Accera_on_MacOS.md | 2 +- docs/Install/Installing_Accera_on_Ubuntu.md | 2 +- docs/Install/Installing_Accera_on_Windows.md | 2 +- docs/Install/README.md | 2 +- docs/Manual/00 Introduction.md | 2 +- docs/Manual/01 Arrays.md | 2 +- docs/Manual/02 Simple Affine Loop Nests.md | 33 ++- docs/Manual/03 Schedules.md | 2 +- docs/Manual/04 Fusing.md | 2 +- docs/Manual/05 Targets.md | 2 +- docs/Manual/06 Plans - Caching.md | 2 +- ...07 Plans - Operations and Optimizations.md | 2 +- .../08 Deferred Layout of Constant Arrays.md | 2 +- docs/Manual/09 Parameters.md | 2 +- docs/Manual/10 Packages.md | 2 +- docs/Manual/README.md | 2 +- docs/Reference/accera.md | 5 +- docs/Reference/classes/Array/Array.md | 4 +- docs/Reference/classes/Array/Layout.md | 4 +- docs/Reference/classes/Array/Role.md | 4 +- .../classes/Array/deferred_layout.md | 4 +- docs/Reference/classes/Array/sub_array.md | 4 +- docs/Reference/classes/Nest/Nest.md | 4 +- docs/Reference/classes/Nest/create_plan.md | 4 +- .../Reference/classes/Nest/create_schedule.md | 4 +- docs/Reference/classes/Nest/get_indices.md | 4 +- .../Reference/classes/Nest/iteration_logic.md | 4 +- docs/Reference/classes/Package/Format.md | 4 +- docs/Reference/classes/Package/Mode.md | 4 +- docs/Reference/classes/Package/Package.md | 4 +- docs/Reference/classes/Package/Platform.md | 4 +- docs/Reference/classes/Package/add.md | 4 +- .../classes/Package/add_description.md | 4 +- docs/Reference/classes/Package/build.md | 4 +- docs/Reference/classes/Plan/bind.md | 4 +- docs/Reference/classes/Plan/cache.md | 4 +- docs/Reference/classes/Plan/kernelize.md | 4 +- docs/Reference/classes/Plan/parallelize.md | 4 +- docs/Reference/classes/Plan/tensorize.md | 4 +- docs/Reference/classes/Plan/unroll.md | 4 +- 
docs/Reference/classes/Plan/vectorize.md | 4 +- .../Reference/classes/Schedule/create_plan.md | 4 +- .../classes/Schedule/is_valid_loop_order.md | 4 +- docs/Reference/classes/Schedule/pad.md | 4 +- docs/Reference/classes/Schedule/reorder.md | 4 +- docs/Reference/classes/Schedule/skew.md | 4 +- docs/Reference/classes/Schedule/split.md | 4 +- docs/Reference/classes/Schedule/tile.md | 4 +- docs/Reference/classes/Target/Architecture.md | 4 +- docs/Reference/classes/Target/Category.md | 4 +- docs/Reference/classes/Target/Model.md | 4 +- docs/Reference/classes/Target/Runtime.md | 4 +- docs/Reference/classes/Target/Target.md | 4 +- .../enumerations/MMASchedulingPolicy.md | 4 +- docs/Reference/enumerations/MMAShape.md | 4 +- docs/Reference/enumerations/ScalarType.md | 4 +- docs/Reference/functions/cast.md | 77 ++++++ .../functions/create_parameter_grid.md | 4 +- docs/Reference/functions/create_parameters.md | 4 +- docs/Reference/functions/fuse.md | 4 +- docs/Reference/safety_analysis.md | 4 +- docs/Tutorials/Hello_MatMul.md | 2 +- docs/Tutorials/Hello_MatMul_GPU.md | 2 +- docs/Tutorials/Optimized_MatMul.md | 2 +- docs/Tutorials/Pi3_Cross_Compilation.md | 2 +- docs/Tutorials/README.md | 2 +- tools/benchmarkers/gpu_benchmark_tool.py | 19 +- 115 files changed, 1131 insertions(+), 467 deletions(-) create mode 100644 docs/Reference/functions/cast.md diff --git a/.azure/cuda/cuda-benchmark-baseline.yml b/.azure/cuda/cuda-benchmark-baseline.yml index 41e00371..34ee2931 100644 --- a/.azure/cuda/cuda-benchmark-baseline.yml +++ b/.azure/cuda/cuda-benchmark-baseline.yml @@ -54,13 +54,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas 
$(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_square.csv - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_square.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas 
$(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_resnet_inception.csv + python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv + python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cublas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/cublas/cublas_gemm --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv displayName: Run CUBLAS benchmarks workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: @@ -76,13 +71,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass 
$(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_square.csv - python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_square.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass 
$(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_resnet_inception.csv + python gpu_benchmark_tool.py --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv + python gpu_benchmark_tool.py --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --cutlass $(System.DefaultWorkingDirectory)/cutlass/build/tools/profiler/cutlass_profiler --input gemm_rectangle_A6000.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv displayName: Run CUTLASS benchmarks workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: diff --git a/.azure/cuda/cuda-benchmark-fp16-bert.yml b/.azure/cuda/cuda-benchmark-fp16-bert.yml index f5963e6b..bc129347 100644 --- a/.azure/cuda/cuda-benchmark-fp16-bert.yml +++ b/.azure/cuda/cuda-benchmark-fp16-bert.yml @@ -46,9 +46,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. 
VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 python gpu_benchmark_tool.py --input gemm_bert_assorted.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True diff --git a/.azure/cuda/cuda-benchmark-fp16-big.yml b/.azure/cuda/cuda-benchmark-fp16-big.yml index f526df4e..0e03ec49 100644 --- a/.azure/cuda/cuda-benchmark-fp16-big.yml +++ b/.azure/cuda/cuda-benchmark-fp16-big.yml @@ -48,16 +48,9 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_big_A6000.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_big_A6000.csv,gemm_big.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp16 benchmarks BIG A6000 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_big.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp16 benchmarks BIG - workingDirectory: 
"$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/cuda/cuda-benchmark-fp16.yml b/.azure/cuda/cuda-benchmark-fp16.yml index 0c1148c8..a7755b68 100644 --- a/.azure/cuda/cuda-benchmark-fp16.yml +++ b/.azure/cuda/cuda-benchmark-fp16.yml @@ -48,16 +48,9 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_small_A6000.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_small_A6000.csv,gemm_small.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp16 benchmarks A6000 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_small.csv --type h --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp16 benchmarks - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/cuda/cuda-benchmark-fp32-bert.yml b/.azure/cuda/cuda-benchmark-fp32-bert.yml index cb374382..d9642653 100644 --- a/.azure/cuda/cuda-benchmark-fp32-bert.yml +++ b/.azure/cuda/cuda-benchmark-fp32-bert.yml @@ -46,9 +46,6 @@ jobs: displayName: Python build workingDirectory: 
"$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 python gpu_benchmark_tool.py --input gemm_bert_assorted.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True diff --git a/.azure/cuda/cuda-benchmark-fp32-big.yml b/.azure/cuda/cuda-benchmark-fp32-big.yml index e73e3a44..47a2bce8 100644 --- a/.azure/cuda/cuda-benchmark-fp32-big.yml +++ b/.azure/cuda/cuda-benchmark-fp32-big.yml @@ -48,17 +48,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_big_A6000.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_big_A6000.csv,gemm_big.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp32 benchmarks BIG A6000 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_big.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload 
official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp32 benchmarks BIG - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) - diff --git a/.azure/cuda/cuda-benchmark-fp32-resnet.yml b/.azure/cuda/cuda-benchmark-fp32-resnet.yml index 53cc2260..f10b4362 100644 --- a/.azure/cuda/cuda-benchmark-fp32-resnet.yml +++ b/.azure/cuda/cuda-benchmark-fp32-resnet.yml @@ -46,9 +46,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 python gpu_benchmark_tool.py --input gemm_resnet_inception.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True diff --git a/.azure/cuda/cuda-benchmark-fp32.yml b/.azure/cuda/cuda-benchmark-fp32.yml index 5d479913..b371ebf1 100644 --- a/.azure/cuda/cuda-benchmark-fp32.yml +++ b/.azure/cuda/cuda-benchmark-fp32.yml @@ -48,16 +48,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_small_A6000.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_small_A6000.csv,gemm_small.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload 
official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp32 benchmarks A6000 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --input gemm_small.csv --type s --target 'NVidia RTX A6000' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp32 benchmarks - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) \ No newline at end of file diff --git a/.azure/rocm/rocm-benchmark-baseline.yml b/.azure/rocm/rocm-benchmark-baseline.yml index bd2bb907..2e4bdeee 100644 --- a/.azure/rocm/rocm-benchmark-baseline.yml +++ b/.azure/rocm/rocm-benchmark-baseline.yml @@ -52,13 +52,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_rectangle_MI100.csv - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_square.csv - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) 
--output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_rectangle_MI100.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_square.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_resnet_inception.csv + python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output 
$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_rectangle_MI100.csv,gemm_square.csv,gemm_bert_assorted.csv + python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --rocblas $(Build.SourcesDirectory)/build/temp.linux-x86_64-3.8/tools/benchmarkers/rocblas/rocblas_gemm --input gemm_rectangle_MI100.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv displayName: Run ROCBLAS benchmarks workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: @@ -73,13 +68,8 @@ jobs: - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_rectangle_MI100.csv - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_square.csv - python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload 
official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_rectangle_MI100.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_square.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_bert_assorted.csv - python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_resnet_inception.csv + python gpu_benchmark_tool.py --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True 
--verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_rectangle_MI100.csv,gemm_square.csv,gemm_bert_assorted.csv + python gpu_benchmark_tool.py --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --composable_kernel $(System.DefaultWorkingDirectory)/composable_kernel/build/bin/ckProfiler --input gemm_rectangle_MI100.csv,gemm_square.csv,gemm_bert_assorted.csv,gemm_resnet_inception.csv displayName: Run CK benchmarks (FP16, RECT) workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: diff --git a/.azure/rocm/rocm-benchmark-fp16-bert.yml b/.azure/rocm/rocm-benchmark-fp16-bert.yml index 003d3420..1cf846ef 100644 --- a/.azure/rocm/rocm-benchmark-fp16-bert.yml +++ b/.azure/rocm/rocm-benchmark-fp16-bert.yml @@ -43,9 +43,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. 
VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib diff --git a/.azure/rocm/rocm-benchmark-fp16-big.yml b/.azure/rocm/rocm-benchmark-fp16-big.yml index 575968af..2bb82b1e 100644 --- a/.azure/rocm/rocm-benchmark-fp16-big.yml +++ b/.azure/rocm/rocm-benchmark-fp16-big.yml @@ -47,18 +47,9 @@ jobs: export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_big_MI100.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True + python gpu_benchmark_tool.py --input gemm_big_MI100.csv,gemm_big.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True displayName: Run fp16 benchmarks BIG MI100 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - export LD_LIBRARY_PATH=${ROCM_PATH}/lib - echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_big.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True - displayName: Run fp16 benchmarks BIG - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/rocm/rocm-benchmark-fp16.yml b/.azure/rocm/rocm-benchmark-fp16.yml index 19c36bac..280ab905 100644 --- 
a/.azure/rocm/rocm-benchmark-fp16.yml +++ b/.azure/rocm/rocm-benchmark-fp16.yml @@ -47,18 +47,9 @@ jobs: export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_small_MI100.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True + python gpu_benchmark_tool.py --input gemm_small_MI100.csv,gemm_small.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True displayName: Run fp16 benchmarks MI100 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - export LD_LIBRARY_PATH=${ROCM_PATH}/lib - echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_small.csv --type h --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True - displayName: Run fp16 benchmarks - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/rocm/rocm-benchmark-fp32-bert.yml b/.azure/rocm/rocm-benchmark-fp32-bert.yml index 588cedc3..7db56566 100644 --- a/.azure/rocm/rocm-benchmark-fp32-bert.yml +++ b/.azure/rocm/rocm-benchmark-fp32-bert.yml @@ -43,9 +43,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a 
comma-separated list of device IDs - # e.g. VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib diff --git a/.azure/rocm/rocm-benchmark-fp32-big.yml b/.azure/rocm/rocm-benchmark-fp32-big.yml index b067ac19..05102fd0 100644 --- a/.azure/rocm/rocm-benchmark-fp32-big.yml +++ b/.azure/rocm/rocm-benchmark-fp32-big.yml @@ -47,19 +47,10 @@ jobs: export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_big_MI100.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True + python gpu_benchmark_tool.py --input gemm_big_MI100.csv,gemm_big.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True displayName: Run fp32 benchmarks BIG MI100 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - export LD_LIBRARY_PATH=${ROCM_PATH}/lib - echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_big.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True - displayName: Run fp32 benchmarks BIG - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) diff --git a/.azure/rocm/rocm-benchmark-fp32-resnet.yml 
b/.azure/rocm/rocm-benchmark-fp32-resnet.yml index 615e7e7d..44fdcb1e 100644 --- a/.azure/rocm/rocm-benchmark-fp32-resnet.yml +++ b/.azure/rocm/rocm-benchmark-fp32-resnet.yml @@ -43,9 +43,6 @@ jobs: displayName: Python build workingDirectory: "$(Build.SourcesDirectory)" - # VISIBLE_DEVICES can be overwritten at Pipeline scheduling time to - # a comma-separated list of device IDs - # e.g. VISIBLE_DEVICES="0, 3" - bash: | export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib diff --git a/.azure/rocm/rocm-benchmark-fp32.yml b/.azure/rocm/rocm-benchmark-fp32.yml index dfd1b80f..e98f93a4 100644 --- a/.azure/rocm/rocm-benchmark-fp32.yml +++ b/.azure/rocm/rocm-benchmark-fp32.yml @@ -47,19 +47,8 @@ jobs: export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 export LD_LIBRARY_PATH=${ROCM_PATH}/lib echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_small_MI100.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True + python gpu_benchmark_tool.py --input gemm_small_MI100.csv,gemm_small.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output $(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True displayName: Run fp32 benchmarks MI100 workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" env: ACCOUNT_KEY: $(ACCOUNT_KEY) - - - bash: | - export PYTHONPATH=$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8 - export LD_LIBRARY_PATH=${ROCM_PATH}/lib - echo "LD_LIBRARY_PATH" ${LD_LIBRARY_PATH} - python gpu_benchmark_tool.py --input gemm_small.csv --type s --target 'AMD MI100' --branch $(Build.SourceBranch) --output 
$(Build.SourcesDirectory)/build/lib.linux-x86_64-3.8/accera_benchmarks/results --upload official_build_container_DO_NOT_UPLOAD_HERE --janitor True --verbose True --check True - displayName: Run fp32 benchmarks - workingDirectory: "$(Build.SourcesDirectory)/tools/benchmarkers" - env: - ACCOUNT_KEY: $(ACCOUNT_KEY) - diff --git a/accera/acc-opt/test/thrifty_caching.mlir b/accera/acc-opt/test/thrifty_caching.mlir index c9c9aa56..335c7e25 100644 --- a/accera/acc-opt/test/thrifty_caching.mlir +++ b/accera/acc-opt/test/thrifty_caching.mlir @@ -17,10 +17,10 @@ module @test_thrifty_caching_simple_input_cache attributes {llvm.data_layout = " %4 = accln.sym_index {name = "k_i"} #accln<"index{k_i,8}"> %5 = accln.sym_index {name = "j_o"} #accln<"index{j_o,5}"> "accv.lambda"() ( { - %6 = "accxp.make_cache"() {memorySpace = 0 : i64, multiCacheAccessIndices = [], offsetAccessIndices = [], offsetArrayToCacheAccessMap = affine_map<(d0) -> (d0)>} : () -> memref + %6 = "accxp.make_cache"() {activeBlockToCacheMap = affine_map<(d0, d1) -> (d0, d1)>, memorySpace = 0 : i64, multiCacheAccessIndices = [], offsetAccessIndices = [], offsetArrayToCacheAccessMap = affine_map<(d0) -> (d0)>} : () -> memref %7 = "accxp.begin_create_cache"(%arg0, %6, %arg0, %1, %2, %0, %4, %1, %2) {activeBlockCache, cacheAccessMaps = {manualCacheDimOrder = [0, 1]}, cacheHierarchyLevel = 0 : i64, cacheIndex = #accln<"index{i_i,4}">, cacheRegionBaseIndices = [[#accln<"index{i,0}">], [#accln<"index{k,2}">]], cacheRegionRelevantIndexRanges = [#accln<"indexrange{i_i,4}={0:4:1}">, #accln<"indexrange{k_i,8}={0:32:1}">], dimReorderCache, id = 0 : i64, operand_segment_sizes = dense<[1, 1, 1, 4, 2]> : vector<5xi32>, thrifty, triggerIndex = #accln<"index{i_i,4}">} : (memref<32x32xf32, affine_map<(d0, d1) -> (d0 * 32 + d1)>>, memref, memref<32x32xf32, affine_map<(d0, d1) -> (d0 * 32 + d1)>>, index, index, index, index, index, index) -> index "accxp.end_cache_region"(%7) : (index) -> () - %8 = "accxp.make_cache"() {memorySpace 
= 0 : i64, multiCacheAccessIndices = [], offsetAccessIndices = [], offsetArrayToCacheAccessMap = affine_map<(d0) -> (d0)>} : () -> memref + %8 = "accxp.make_cache"() {activeBlockToCacheMap = affine_map<(d0, d1) -> (d0, d1)>, memorySpace = 0 : i64, multiCacheAccessIndices = [], offsetAccessIndices = [], offsetArrayToCacheAccessMap = affine_map<(d0) -> (d0)>} : () -> memref %9 = "accxp.begin_create_cache"(%arg1, %8, %arg1, %5, %2, %3, %4, %5) {activeBlockCache, cacheAccessMaps = {manualCacheDimOrder = [0, 1]}, cacheHierarchyLevel = 0 : i64, cacheIndex = #accln<"index{k_o,7}">, cacheRegionBaseIndices = [[#accln<"index{k,2}">], [#accln<"index{j,1}">], [#accln<"index{k,2}">]], cacheRegionRelevantIndexRanges = [#accln<"indexrange{k_o,7}={0:32:32}">, #accln<"indexrange{j_i,6}={0:16:1}">, #accln<"indexrange{k_i,8}={0:32:1}">], dimReorderCache, id = 1 : i64, operand_segment_sizes = dense<[1, 1, 1, 4, 1]> : vector<5xi32>, thrifty, triggerIndex = #accln<"index{k_o,7}">} : (memref<32x32xf32, affine_map<(d0, d1) -> (d0 * 32 + d1)>>, memref, memref<32x32xf32, affine_map<(d0, d1) -> (d0 * 32 + d1)>>, index, index, index, index, index) -> index "accxp.end_cache_region"(%9) : (index) -> () affine.for %arg3 = 0 to 32 step 4 { diff --git a/accera/acc-opt/test/value_mlir_test.cpp b/accera/acc-opt/test/value_mlir_test.cpp index 985470c0..4dc6d45f 100644 --- a/accera/acc-opt/test/value_mlir_test.cpp +++ b/accera/acc-opt/test/value_mlir_test.cpp @@ -1735,7 +1735,7 @@ TEST_CASE("jit_int8_simple_matrix_multiply_test1") }); auto computeKernel = Kernel("compute", [&, i = i, j = j, k = k]() { - auto a = UnsignedCast(A(i, k), cType); + auto a = Cast(A(i, k), cType); auto b = Cast(B(k, j), cType); auto prod = a * b; @@ -1860,7 +1860,7 @@ TEST_CASE("jit_int8_simple_matrix_multiply_test3") auto [kOuter, kInner] = schedule.Split(k, kernelK); auto computeKernel = Kernel("compute", [&, i = i, j = j, k = k]() { - auto a = UnsignedCast(A(i, k), cType); + auto a = Cast(A(i, k), cType); auto b = 
Cast(B(k, j), cType); auto prod = a * b; C(i, j) += prod; diff --git a/accera/ir/include/exec/ExecutionPlanOps.td b/accera/ir/include/exec/ExecutionPlanOps.td index 6d164f04..2899313a 100644 --- a/accera/ir/include/exec/ExecutionPlanOps.td +++ b/accera/ir/include/exec/ExecutionPlanOps.td @@ -101,6 +101,7 @@ def accxp_MakeCacheOp : accxp_Op<"make_cache"> { corresponding position in the cache. }]; let arguments = (ins MemorySpaceAttr:$memorySpace, + AffineMapAttr:$activeBlockToCacheMap, AffineMapAttr:$offsetArrayToCacheAccessMap, ArrayAttr:$offsetAccessIndices, ArrayAttr:$multiCacheAccessIndices); @@ -112,6 +113,7 @@ def accxp_MakeCacheOp : accxp_Op<"make_cache"> { OpBuilder<(ins "mlir::MemRefType":$cache, "MemorySpace":$memorySpace, + "AffineMap":$activeBlockToCacheMap, "AffineMap":$offsetArrayToCacheAccessMap, "const std::vector&":$offsetAccessIndices, "const std::vector&":$multiCacheAccessIndices @@ -184,7 +186,7 @@ def accxp_ActiveBlockCacheCopyOp : accxp_Op<"active_block_cache_copy", [AttrSize AffineMapAttr:$activeBlockToCacheMap, UnitAttr:$toCache, UnitAttr:$thrifty, - UnitAttr:$skipBarriers, // TODO : remove this once barrier analysis hoists barriers out of loops + UnitAttr:$skipBarriers, // TODO : remove this once barrier analysis hoists barriers out of loops OptionalAttr:$vectorizationInfo); } diff --git a/accera/ir/include/value/ValueOps.td b/accera/ir/include/value/ValueOps.td index 663cb305..71a6fb47 100644 --- a/accera/ir/include/value/ValueOps.td +++ b/accera/ir/include/value/ValueOps.td @@ -1217,12 +1217,23 @@ def accv_MMALoadSyncOp : accv_Op<"wmma_load_sync", UI32Attr:$mmaShapeType, I8Attr:$operandType, Variadic:$indices, - AffineMapAttr:$map + AffineMapAttr:$map, // TODO : maybe rename this positionMap or something? 
+ OptionalAttr:$tileAccessMap ); let results = (outs MemRefRankOf<[I8, I32, F16, F32], [1]>:$result); let builders = [ + OpBuilder<(ins "Type":$resultType, "Value":$memref, "MMAShape":$mmaShapeType, "MMAOperandType":$operandType, "AffineMap":$map, "ValueRange":$mapOperands, "AffineMap":$tileAccessMap), [{ + assert(map.getNumInputs() == mapOperands.size() && "inconsistent index info"); + $_state.addOperands(memref); + $_state.addOperands(mapOperands); + $_state.addAttribute("mmaShapeType", $_builder.getUI32IntegerAttr((uint32_t)mmaShapeType)); + $_state.addAttribute("operandType", $_builder.getI8IntegerAttr((int8_t)operandType)); + $_state.addAttribute(getMapAttrName(), AffineMapAttr::get(map)); + $_state.addAttribute(getTileAccessMapAttrName(), AffineMapAttr::get(tileAccessMap)); + $_state.addTypes(resultType); + }]>, OpBuilder<(ins "Type":$resultType, "Value":$memref, "MMAShape":$mmaShapeType, "MMAOperandType":$operandType, "AffineMap":$map, "ValueRange":$mapOperands), [{ assert(map.getNumInputs() == mapOperands.size() && "inconsistent index info"); $_state.addOperands(memref); @@ -1255,6 +1266,7 @@ def accv_MMALoadSyncOp : accv_Op<"wmma_load_sync", } static StringRef getMapAttrName() { return "map"; } + static StringRef getTileAccessMapAttrName() { return "tileAccessMap"; } }]; let assemblyFormat = [{ @@ -1283,9 +1295,20 @@ def accv_MMAStoreSyncOp : accv_Op<"wmma_store_sync", [ Arg, "",[MemWrite]>:$memref, UI32Attr:$mmaShapeType, Variadic:$indices, - AffineMapAttr:$map); + AffineMapAttr:$map, // TODO : maybe rename this positionMap or something? 
+ OptionalAttr:$tileAccessMap); let builders = [ + OpBuilder<(ins "Value":$src, "Value":$memref, "MMAShape":$mmaShapeType, "AffineMap":$map, "ValueRange":$mapOperands, "AffineMap":$tileAccessMap), [{ + assert(map.getNumInputs() == mapOperands.size() && "inconsistent index info"); + $_state.addOperands(src); + $_state.addOperands(memref); + $_state.addOperands(mapOperands); + $_state.addAttribute("mmaShapeType", $_builder.getUI32IntegerAttr((uint32_t)mmaShapeType)); + $_state.addAttribute(getMapAttrName(), AffineMapAttr::get(map)); + $_state.addAttribute(getTileAccessMapAttrName(), AffineMapAttr::get(tileAccessMap)); + + }]>, OpBuilder<(ins "Value":$src, "Value":$memref, "MMAShape":$mmaShapeType, "AffineMap":$map, "ValueRange":$mapOperands), [{ assert(map.getNumInputs() == mapOperands.size() && "inconsistent index info"); $_state.addOperands(src); @@ -1320,6 +1343,7 @@ def accv_MMAStoreSyncOp : accv_Op<"wmma_store_sync", [ } static StringRef getMapAttrName() { return "map"; } + static StringRef getTileAccessMapAttrName() { return "tileAccessMap"; } }]; let assemblyFormat = [{ diff --git a/accera/ir/src/exec/ExecutionPlanOps.cpp b/accera/ir/src/exec/ExecutionPlanOps.cpp index 89fdb2a7..a4640bd4 100644 --- a/accera/ir/src/exec/ExecutionPlanOps.cpp +++ b/accera/ir/src/exec/ExecutionPlanOps.cpp @@ -974,6 +974,7 @@ namespace executionPlan cacheType, memorylocation, mlir::AffineMap::getMultiDimIdentityMap(cacheType.getRank(), builder.getContext()), + mlir::AffineMap::getMultiDimIdentityMap(cacheType.getRank(), builder.getContext()), std::vector{}, std::vector{}); } @@ -982,6 +983,7 @@ namespace executionPlan OperationState& result, mlir::MemRefType cacheType, accera::ir::value::MemorySpace memorylocation, + AffineMap activeBlockToCacheMap, AffineMap offsetArrayToCacheAccessMap, const std::vector& offsetAccessIndices, const std::vector& multiCacheAccessIndices) @@ -993,6 +995,7 @@ namespace executionPlan result, cacheType, memorylocation, + activeBlockToCacheMap, 
offsetArrayToCacheAccessMap, offsetAccessIndexAttrs, multiCacheAccessIndexAttrs); diff --git a/accera/python/accera/__init__.py b/accera/python/accera/__init__.py index f4488f4b..38482454 100644 --- a/accera/python/accera/__init__.py +++ b/accera/python/accera/__init__.py @@ -18,7 +18,7 @@ from ._lang_python import CompilerOptions, ScalarType, _GetTargetDeviceFromName from ._lang_python import ( abs, max, min, ceil, floor, sqrt, exp, log, log10, log2, sin, cos, tan, sinh, cosh, tanh, logical_and, logical_or, - logical_not, _cast, _unsigned_cast + logical_not, cast ) # Global initialization diff --git a/accera/python/accera/test/dsl_tests.py b/accera/python/accera/test/dsl_tests.py index c368de37..e4dd5638 100644 --- a/accera/python/accera/test/dsl_tests.py +++ b/accera/python/accera/test/dsl_tests.py @@ -438,7 +438,8 @@ def _(): def test_array_value_type_cast(self) -> None: A = Array( - shape=(256, 32), role=Array.Role.INPUT, layout=Array.Layout.FIRST_MAJOR + shape=(256, 32), role=Array.Role.INPUT, layout=Array.Layout.FIRST_MAJOR, + element_type=ScalarType.float32, ) B = Array( shape=(256, 32), @@ -452,7 +453,7 @@ def test_array_value_type_cast(self) -> None: @nest.iteration_logic def _(): - A[i, j] = 5 # implicit cast from int8 to float + A[i, j] = 5 # implicit cast from int8 to float32 B[i, j] = 10 # implicit cast from int8 to int32 A_test = np.random.random((256, 32)).astype(np.float32) @@ -691,12 +692,36 @@ def test_arithmetic_operations(self) -> None: nest, A, B, C = self._create_nest((16, 10, 11), type=t) i, j, k = nest.get_indices() + int_val = 2 + float_val = 1.5 + @nest.iteration_logic def _(): C[i, j] = A[i, k] + B[k, j] # test assignment C[i, j] += A[i, k] - B[k, j] C[i, j] += A[i, k] * B[k, j] C[i, j] += A[i, k] / B[k, j] + + if t != ScalarType.float16: + C[i, j] += int_val + A[i,k] + C[i, j] += int_val - A[i, k] + C[i, j] += int_val * A[i,k] + C[i, j] += int_val / A[i, k] + C[i, j] += A[i, k] + int_val + C[i, j] += A[i, k] - int_val + C[i, j] += A[i, 
k] * int_val + C[i, j] += A[i, k] / int_val + + if t in FLOAT_TYPES: + C[i, j] += float_val + A[i,k] + C[i, j] += float_val - A[i, k] + C[i, j] += float_val * A[i,k] + C[i, j] += float_val / A[i, k] + C[i, j] += A[i, k] + float_val + C[i, j] += A[i, k] - float_val + C[i, j] += A[i, k] * float_val + C[i, j] += A[i, k] / float_val + C[i, j] += -A[i, k] C[i, j] += A[i, k] // B[k, j] C[i, j] += A[i, k] % B[k, j] diff --git a/accera/python/accera/test/mfma_tests.py b/accera/python/accera/test/mfma_tests.py index 0d215db2..5ef24c37 100644 --- a/accera/python/accera/test/mfma_tests.py +++ b/accera/python/accera/test/mfma_tests.py @@ -99,7 +99,7 @@ def _check_cu_has_no_mfma(self, test_name, verifier): ) checker.check_not('__builtin_amdgcn_mfma_') checker.run() - + def _get_np_datatype(self, p): from bfloat16 import bfloat16 if p.element_type == ScalarType.bfloat16: @@ -109,7 +109,8 @@ def _get_np_datatype(self, p): def _get_random_data(self, p): datatype = self._get_np_datatype(p) - if p.element_type == ScalarType.int8 or p.element_type == ScalarType.int32: + if p.element_type in [ScalarType.int8, ScalarType.int16, ScalarType.int32, ScalarType.int64, ScalarType.uint8, + ScalarType.uint16, ScalarType.uint32, ScalarType.uint64]: return np.random.randint(-2, 2, p.shape, datatype) return np.random.random(p.shape).astype(datatype) @@ -183,7 +184,7 @@ def _rocm_matmul(self, test_name, M, N, K, block_tile, outer_tile_k, thread_tile if thread_tile is None: thread_tile = block_tile - + if inner_tile_k is None: inner_tile_k = outer_tile_k @@ -285,7 +286,7 @@ def _rocm_batch_matmul(self, test_name, batch_count, M, N, K, block_tile, outer_ if thread_tile is None: thread_tile = block_tile - + if inner_tile_k is None: inner_tile_k = outer_tile_k @@ -308,7 +309,7 @@ def _(): i: block_tile[0], j: block_tile[1], k: outer_tile_k - }) + }) else: ii, jj, kk = schedule.tile({ i: block_tile[0], @@ -538,7 +539,7 @@ def _(): i: outer_tile_x, j: outer_tile_y }) - + mma_shape = 
_MMAShape.M16xN16xK4_B1 num_total_passes = 4 target = Target(Target.Model.AMD_MI100) @@ -834,7 +835,7 @@ def _(): if ROCM_AVAILABLE: self._verify_matmul(function, A, B, C, v) - + # This should produce MFMA instructions def test_rocm_tensorize_multi_block_multi_warp_output_reordered_indices(self) -> None: from accera import Array, Nest, Package, ScalarType, Target @@ -1406,7 +1407,7 @@ def _(): shutil.rmtree(output_dir, ignore_errors=True) with verifiers.VerifyPackage(self, test_name, output_dir) as v: - package.build(test_name, + package.build(test_name, format=Package.Format.MLIR_DYNAMIC, mode=Package.Mode.RELEASE, output_dir=output_dir @@ -1561,7 +1562,7 @@ def _(): j: outer_tile_n, k: outer_tile_k }) - + target = Target(Target.Model.NVIDIA_RTX_A6000) if tensorize: tensor_splits = target.tensor_core_info.compute_tensor_splits(mma_shape, num_total_passes) @@ -1583,7 +1584,7 @@ def _(): kk: default_thread_splits[2] }) schedule.reorder(i, j, k, ii, jj, kk, iii, jjj, kkk) - + plan = schedule.create_plan(target=target) plan.bind( mapping={ @@ -1643,7 +1644,7 @@ def test_cuda_non_square_last_major_output(self) -> None: def test_cuda_non_square_last_major_inputs_output(self) -> None: self._cuda_cache_tensorize(M=1280, N=768, K=1024, outer_tile_m=16, outer_tile_n=16, outer_tile_k=128, test_name="test_cuda_non_square_last_major_inputs_output", tensorize=False, - cache=False, vectorize=False, element_type=ScalarType.float32, + cache=False, vectorize=False, element_type=ScalarType.float32, array_layouts=[Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR]) def test_cuda_tensorize_non_square_last_major_inputs(self) -> None: @@ -2302,6 +2303,197 @@ def test_batchgemm_rocm_vectorized_cache_double_buffering_tensorize_square_bspli test_name="test_batchgemm_rocm_vectorized_cache_double_buffering_tensorize_square_bsplit", double_buffer=True, double_buffer_location=_MemorySpace.PRIVATE, vectorize=True, use_static_offsets=False) + def 
_test_rocm_cache_memory_order_helper(self, a_layout, a_cache_layout, double_buffer, vectorize, tensorize) -> None: + + from accera import Array, Nest, Package, ScalarType, Target + + M = 512 + N = 512 + K = 512 + + # Pick the A and B tile sizes to be smaller than the number of threads per block + outer_tile_x = 64 + outer_tile_y = 64 + outer_tile_k = 64 + + A = Array(role=Array.Role.INPUT, element_type=ScalarType.float32, shape=(M, K), layout=a_layout) + B = Array(role=Array.Role.INPUT, element_type=ScalarType.float32, shape=(K, N), layout=Array.Layout.FIRST_MAJOR) + C = Array(role=Array.Role.INPUT_OUTPUT, element_type=ScalarType.float32, shape=(M, N), layout=Array.Layout.FIRST_MAJOR) + + nest = Nest(shape=(M, N, K)) + i, j, k = nest.get_indices() + + @nest.iteration_logic + def _(): + C[i, j] += A[i, k] * B[k, j] + + schedule = nest.create_schedule() + + ii, jj, kk = schedule.tile({ + i: outer_tile_x, + j: outer_tile_y, + k: outer_tile_k + }) + + mma_shape = _MMAShape.M16xN16xK4_B1 + num_total_passes = 4 + target = Target(Target.Model.AMD_MI100) + tensor_splits = target.tensor_core_info.compute_tensor_splits(mma_shape, num_total_passes=num_total_passes) + + iii, jjj, kkk = schedule.tile({ + ii: tensor_splits[0], + jj: tensor_splits[1], + kk: tensor_splits[2] + }) + if tensorize: + outer_nest_order = (i, j, k, ii, jj, kk) + plan, tensorization_indices = schedule._create_tensorizable_plan(target, block_indices=(i, j), warp_indices=(ii, jj), tensor_indices=(iii, jjj, kkk), outer_nest_order=outer_nest_order, mma_shape=mma_shape) + plan.tensorize(indices=tensorization_indices, mma_shape=mma_shape, num_total_passes=num_total_passes) + else: + schedule.reorder(i, j, k, ii, jj, kk, iii, jjj, kkk) + plan = schedule.create_plan(target) + plan.bind( + mapping={ + i: target.GridUnit.BLOCK_Y, + j: target.GridUnit.BLOCK_X, + iii: target.GridUnit.THREAD_Y, + jjj: target.GridUnit.THREAD_X + } + ) + + plan.cache(A, + index=ii, + double_buffer=double_buffer, + vectorize=vectorize, 
+ location=target.MemorySpace.SHARED, + layout=a_cache_layout + ) + + layout_str_map = { + Array.Layout.FIRST_MAJOR : "F", + Array.Layout.LAST_MAJOR : "L" + } + name_parts = [ + "test_rocm_cache_tensorized", + layout_str_map[a_layout], + layout_str_map[a_cache_layout], + f"_db_{double_buffer}", + f"_vec_{vectorize}", + f"_tens_{tensorize}" + ] + test_name = "_".join(name_parts) + package = Package() + function = package.add(plan, args=(A, B, C), base_name=test_name) + + self._verify_matrix_multiplication_function( + function, + package, + test_name, + check_correctness=ROCM_AVAILABLE, + file_list=[f"{test_name}.cu", f"{test_name}.hat"], + package_format=Package.Format.DEFAULT | Package.Format.MLIR + ) + + # FIRST-FIRST + def test_rocm_memory_order_cache_tensorized_F_F_T_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, True, True, True) + + def test_rocm_memory_order_cache_tensorized_F_F_T_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, True, True, False) + + def test_rocm_memory_order_cache_tensorized_F_F_T_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, True, False, True) + + def test_rocm_memory_order_cache_tensorized_F_F_F_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, False, True, True) + + def test_rocm_memory_order_cache_tensorized_F_F_T_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, True, False, False) + + def test_rocm_memory_order_cache_tensorized_F_F_F_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, False, False, True) + + def test_rocm_memory_order_cache_tensorized_F_F_F_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, 
Array.Layout.FIRST_MAJOR, False, True, False) + + def test_rocm_memory_order_cache_tensorized_F_F_F_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.FIRST_MAJOR, False, False, False) + + # FIRST-LAST + def test_rocm_memory_order_cache_tensorized_F_L_T_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, True, True, True) + + def test_rocm_memory_order_cache_tensorized_F_L_T_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, True, True, False) + + def test_rocm_memory_order_cache_tensorized_F_L_T_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, True, False, True) + + def test_rocm_memory_order_cache_tensorized_F_L_F_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, False, True, True) + + def test_rocm_memory_order_cache_tensorized_F_L_T_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, True, False, False) + + def test_rocm_memory_order_cache_tensorized_F_L_F_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, False, False, True) + + def test_rocm_memory_order_cache_tensorized_F_L_F_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, False, True, False) + + def test_rocm_memory_order_cache_tensorized_F_L_F_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.FIRST_MAJOR, Array.Layout.LAST_MAJOR, False, False, False) + + # LAST-FIRST + def test_rocm_memory_order_cache_tensorized_L_F_T_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, True, True, True) + + def 
test_rocm_memory_order_cache_tensorized_L_F_T_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, True, True, False) + + def test_rocm_memory_order_cache_tensorized_L_F_T_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, True, False, True) + + def test_rocm_memory_order_cache_tensorized_L_F_F_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, False, True, True) + + def test_rocm_memory_order_cache_tensorized_L_F_T_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, True, False, False) + + def test_rocm_memory_order_cache_tensorized_L_F_F_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, False, False, True) + + def test_rocm_memory_order_cache_tensorized_L_F_F_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, False, True, False) + + def test_rocm_memory_order_cache_tensorized_L_F_F_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.FIRST_MAJOR, False, False, False) + + # LAST-LAST + def test_rocm_memory_order_cache_tensorized_L_L_T_T_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, True, True, True) + + def test_rocm_memory_order_cache_tensorized_L_L_T_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, True, True, False) + + def test_rocm_memory_order_cache_tensorized_L_L_T_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, True, False, True) + + def test_rocm_memory_order_cache_tensorized_L_L_F_T_T(self) -> None: + 
self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, False, True, True) + + def test_rocm_memory_order_cache_tensorized_L_L_T_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, True, False, False) + + def test_rocm_memory_order_cache_tensorized_L_L_F_F_T(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, False, False, True) + + def test_rocm_memory_order_cache_tensorized_L_L_F_T_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, False, True, False) + + def test_rocm_memory_order_cache_tensorized_L_L_F_F_F(self) -> None: + self._test_rocm_cache_memory_order_helper(Array.Layout.LAST_MAJOR, Array.Layout.LAST_MAJOR, False, False, False) + if __name__ == '__main__': unittest.main(verbosity=10) diff --git a/accera/python/accera/test/smoke_tests.py b/accera/python/accera/test/smoke_tests.py index bca45001..1ea6716a 100644 --- a/accera/python/accera/test/smoke_tests.py +++ b/accera/python/accera/test/smoke_tests.py @@ -735,7 +735,7 @@ def test_two_vulkan_gpu_matmul(self) -> None: @expectedFailure(FailedReason.NOT_IN_CORE, "function that contains multiple nests") def test_int8_matmul(self) -> None: - from accera import _cast, _unsigned_cast + from accera import cast # Define our matrix sizes M = 128 @@ -756,7 +756,7 @@ def get_compute_col_sums_schedule(): @compute_col_sums_nest.iteration_logic def _(): - b = _unsigned_cast(B[k, j], ScalarType.int32) + b = cast(B[k, j], ScalarType.int32) col_sums[j] += b return compute_col_sums_nest.create_schedule() @@ -777,7 +777,7 @@ def get_compute_row_sums_schedule(): @compute_row_sums_nest.iteration_logic def _(): - a = _cast(A[i, k], ScalarType.int32) + a = cast(A[i, k], ScalarType.int32) row_sums[i] += a return compute_row_sums_nest.create_schedule() @@ -818,8 +818,8 @@ def get_matmul_schedule(): 
@matmul_nest.iteration_logic def _(): - a = _cast(A[i, k], ScalarType.int32) - b = _unsigned_cast(B[k, j], ScalarType.int32) + a = cast(A[i, k], ScalarType.int32) + b = cast(B[k, j], ScalarType.int32) C[i, j] += a * b return matmul_nest.create_schedule() @@ -3955,9 +3955,8 @@ def file_check_fn(verifier): checker.check('affine.for %[[lpt_iv:[a-z0-9_]+]] = 0 to 2 {') checker.check('affine.for %[[Thread_X_iv:[a-z0-9_]+]] = 0 to 1 {') checker.check('affine.for %[[Thread_Y_iv:[a-z0-9_]+]] = 0 to 1 {') - checker.check('%[[Loaded_A_Val:[0-9_]+]] = affine.load %[[Array_A]][%[[lpt_iv]] * 8 + symbol(%[[Block_X]]) * 16 - (symbol(%[[Block_X]]) floordiv 160) * 2560 + symbol(%[[Thread_Y]]) floordiv 2 - ((%[[lpt_iv]] * 8 + symbol(%[[Thread_Y]]) floordiv 2) floordiv 16) * 16, %[[k_iv]] + %[[kk_iv]] + symbol(%[[Thread_Y]]) * 16 + symbol(%[[Thread_X]]) - (symbol(%[[Thread_Y]]) floordiv 2) * 32] : memref<2560x2048xf32, affine_map<(d0, d1) -> (d0 * 2048 + d1)>>') - # Note: (16*thread_y) % 32 == (16*thread_y) - 32((16*thread_y) floordiv 32) == (16*thread_y) - 32(thread_y floordiv 2) - checker.check('affine.store %[[Loaded_A_Val]], %[[Cache_A]][(%[[lpt_iv]] * 8 + symbol(%[[Thread_Y]]) floordiv 2) mod 16, symbol(%[[Thread_Y]]) * 16 + symbol(%[[Thread_X]]) - (symbol(%[[Thread_Y]]) floordiv 2) * 32] : memref<16x32xf32, 3>') + checker.check('%[[Loaded_A_Val:[0-9_]+]] = affine.load %[[Array_A]][symbol(%[[Block_X]]) * 16 + symbol(%[[Thread_X]]) - (symbol(%[[Block_X]]) floordiv 160) * 2560, %[[lpt_iv]] * 16 + %[[k_iv]] + %[[kk_iv]] + symbol(%[[Thread_Y]])] : memref<2560x2048xf32, affine_map<(d0, d1) -> (d0 * 2048 + d1)>>') + checker.check('affine.store %[[Loaded_A_Val]], %[[Cache_A]][symbol(%[[Thread_X]]), %[[lpt_iv]] * 16 + symbol(%[[Thread_Y]])] : memref<16x32xf32, 3>') # check the B matrix load / store checker.check('"accv.lambda"() ( {') @@ -4518,7 +4517,7 @@ def _(): def test_fill_fp16(self): from accera import Array, Nest, Package, ScalarType - from accera import _cast + from accera 
import cast # Define our vector sizes N = 2**16 @@ -4530,7 +4529,7 @@ def test_fill_fp16(self): @nest.iteration_logic def _(): - Out[i] = _cast(2, ScalarType.float16) + Out[i] = cast(2, ScalarType.float16) schedule = nest.create_schedule() plan = schedule.create_plan() @@ -4850,6 +4849,5 @@ def _(): package_format=Package.Format.DEFAULT | Package.Format.MLIR ) - if __name__ == '__main__': unittest.main(verbosity=10) diff --git a/accera/python/accera/test/unit_tests.py b/accera/python/accera/test/unit_tests.py index c7ae4fe5..e9330c4e 100644 --- a/accera/python/accera/test/unit_tests.py +++ b/accera/python/accera/test/unit_tests.py @@ -101,7 +101,7 @@ def test_scalar_conditionals(self) -> None: self.assertIsInstance(s != 10, Scalar) def test_cast(self) -> None: - from accera import _cast, Array, Nest, ScalarType + from accera import cast, Array, Nest, ScalarType for t in [ScalarType.int8, ScalarType.int16, ScalarType.int32, ScalarType.int64, ScalarType.float16, ScalarType.float32, ScalarType.float64]: M, S, N = 16, 11, 10 @@ -114,10 +114,10 @@ def test_cast(self) -> None: @nest.iteration_logic def _(): - C[i, j] += A[i, k] + _cast(B[k, j], t) - C[i, j] += A[i, k] - _cast(B[k, j], t) - C[i, j] += A[i, k] * _cast(B[k, j], t) - C[i, j] += A[i, k] / _cast(B[k, j], t) + C[i, j] += A[i, k] + cast(B[k, j], t) + C[i, j] += A[i, k] - cast(B[k, j], t) + C[i, j] += A[i, k] * cast(B[k, j], t) + C[i, j] += A[i, k] / cast(B[k, j], t) package = Package() package.add(nest, args=(A, B, C), base_name=f"test_cast_{t.name}") @@ -126,8 +126,8 @@ def _(): package.build(package_name, output_dir=TEST_PACKAGE_DIR) def test_unsigned_cast(self) -> None: - from accera import _unsigned_cast, Array, Nest, ScalarType - for t in [ScalarType.uint8, ScalarType.uint16]: # TODO: , ScalarType.uint32, ScalarType.uint64]: + from accera import cast, Array, Nest, ScalarType + for t in [ScalarType.uint8, ScalarType.uint16, ScalarType.uint32, ScalarType.uint64]: M, S, N = 16, 11, 10 A = 
Array(role=Array.Role.INPUT, element_type=t, shape=(M, S)) B = Array(role=Array.Role.INPUT, element_type=ScalarType.int32, shape=(S, N)) @@ -138,10 +138,10 @@ def test_unsigned_cast(self) -> None: @nest.iteration_logic def _(): - C[i, j] += A[i, k] + _unsigned_cast(B[k, j], t) - C[i, j] += A[i, k] - _unsigned_cast(B[k, j], t) - C[i, j] += A[i, k] * _unsigned_cast(B[k, j], t) - C[i, j] += A[i, k] / _unsigned_cast(B[k, j], t) + C[i, j] += A[i, k] + cast(B[k, j], t) + C[i, j] += A[i, k] - cast(B[k, j], t) + C[i, j] += A[i, k] * cast(B[k, j], t) + C[i, j] += A[i, k] / cast(B[k, j], t) package = Package() package.add(nest, args=(A, B, C), base_name=f"test_unsigned_cast_{t.name}") diff --git a/accera/python/lib/src/ContainerTypes.cpp b/accera/python/lib/src/ContainerTypes.cpp index 297c6ec2..b4157f53 100644 --- a/accera/python/lib/src/ContainerTypes.cpp +++ b/accera/python/lib/src/ContainerTypes.cpp @@ -151,12 +151,9 @@ General constructor. module.def("logical_or", [](value::Scalar s1, value::Scalar s2) { return value::Cast(s1 || s2, s1.GetType()); }); - module.def("_cast", [](value::Scalar s, value::ValueType type) { + module.def("cast", [](value::Scalar s, value::ValueType type) { return value::Cast(s, type); }); - module.def("_unsigned_cast", [](value::Scalar s, value::ValueType type) { - return value::UnsignedCast(s, type); - }); } void DefineArrayClass(py::module& module) @@ -358,6 +355,18 @@ specific to the EmitterContext, specified by the Emittable type. 
.def(float() * py::self) .def(py::self * int()) .def(int() * py::self) + .def(py::self + float()) + .def(float() + py::self) + .def(py::self + int()) + .def(int() + py::self) + .def(py::self / float()) + .def(float() / py::self) + .def(py::self / int()) + .def(int() / py::self) + .def(py::self - float()) + .def(float() - py::self) + .def(py::self - int()) + .def(int() - py::self) .def("__rshift__", [](value::Scalar& s, value::Scalar& shift) { return value::UnsignedShiftRight(s, value::Cast(shift, s.GetType())); }) diff --git a/accera/python/lib/src/SchedulingTypes.cpp b/accera/python/lib/src/SchedulingTypes.cpp index 49e06428..6feb40bb 100644 --- a/accera/python/lib/src/SchedulingTypes.cpp +++ b/accera/python/lib/src/SchedulingTypes.cpp @@ -29,7 +29,7 @@ void DefineScheduleClass(py::module& module) "split", [](value::Schedule& sched, value::ScalarIndex& i, int factor) { auto ret = sched.Split(i, factor); using std::swap; - swap(i, ret.first); + i = std::move(ret.first); return ret.second; }, "i"_a, diff --git a/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp b/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp index fd0a1d9a..03c4ea37 100644 --- a/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp +++ b/accera/transforms/src/exec/ExecutionPlanToAffineLoweringPass.cpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -26,6 +25,7 @@ #include #include #include +#include #include #include @@ -41,11 +41,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -1724,6 +1726,7 @@ MakeCacheOp UpdateActiveBlockCacheAccess(PatternRewriter& rewriter, auto replacementOp = rewriter.create(shapedMakeCacheOp.getLoc(), shapedMakeCacheOp.getType(), shapedMakeCacheOp.memorySpace(), + activeBlockToCacheMap, arrayToCacheMap, offsetAccessIndices, multiCacheAccessIndices); @@ -1950,7 +1953,8 @@ v::MMALoadSyncOp CreateMMALoad(mlir::OpBuilder& 
builder, if (auto srcCacheOp = mlir::dyn_cast_or_null(src.getDefiningOp())) { mlir::AffineValueMap loadAccessInfo = srcCacheOp.insertCachePosition(builder.getInsertionBlock(), baseArrayPosition); - return builder.create(loc, resultType, src, mmaShapeType, operandType, loadAccessInfo.getAffineMap(), loadAccessInfo.getOperands()); + mlir::AffineMap tileAccessMap = srcCacheOp.activeBlockToCacheMap(); + return builder.create(loc, resultType, src, mmaShapeType, operandType, loadAccessInfo.getAffineMap(), loadAccessInfo.getOperands(), tileAccessMap); } else { @@ -1968,7 +1972,8 @@ v::MMAStoreSyncOp CreateMMAStore(mlir::OpBuilder& builder, if (auto dstCacheOp = mlir::dyn_cast_or_null(dst.getDefiningOp())) { mlir::AffineValueMap storeAccessInfo = dstCacheOp.insertCachePosition(builder.getInsertionBlock(), baseArrayPosition); - return builder.create(loc, value, dst, mmaShapeType, storeAccessInfo.getAffineMap(), storeAccessInfo.getOperands()); + mlir::AffineMap tileAccessMap = dstCacheOp.activeBlockToCacheMap(); + return builder.create(loc, value, dst, mmaShapeType, storeAccessInfo.getAffineMap(), storeAccessInfo.getOperands(), tileAccessMap); } else { @@ -2725,23 +2730,25 @@ LogicalResult ActiveBlockCacheCopyOpRewrite::matchAndRewrite(ActiveBlockCacheCop mlir::AffineMap cacheFillNestMap = mlir::AffineMap::get(5, 0, cacheFillNestToFlatExpr); - llvm::SmallVector multiCacheStrides; + llvm::SmallVector outerArrayStrides; int64_t activeBlockOffset; // TODO : do we need to leverage this in any way? 
we're currently just arranging the threads according to fast/slow dimensions of the logical memref - auto strideResult = mlir::getStridesAndOffset(memRefType, multiCacheStrides, activeBlockOffset); + auto strideResult = mlir::getStridesAndOffset(memRefType, outerArrayStrides, activeBlockOffset); assert(succeeded(strideResult)); - auto numMultiCacheDims = multiCacheStrides.size() - activeBlockRank; - std::vector activeBlockStrides(multiCacheStrides.begin() + numMultiCacheDims, multiCacheStrides.end()); + auto numOuterArrayMultiCacheDims = outerArrayStrides.size() - activeBlockRank; + std::vector outerArrayActiveBlockStrides(outerArrayStrides.begin() + numOuterArrayMultiCacheDims, outerArrayStrides.end()); // We want to traverse the dimensions of the active block in increasing stride order, so keep track of the logical dimensions and sort them std::vector> activeBlockLogicalDimAndStride; size_t dimIdxCounter = 0; - std::transform(activeBlockStrides.begin(), activeBlockStrides.end(), std::back_inserter(activeBlockLogicalDimAndStride), [&](int64_t stride) { + std::transform(outerArrayActiveBlockStrides.begin(), outerArrayActiveBlockStrides.end(), std::back_inserter(activeBlockLogicalDimAndStride), [&](int64_t stride) { return std::make_pair(dimIdxCounter++, stride); }); +#if 0 // TODO : re-enable for coalesced reads and fix transpose caching with double-buffering case std::sort(activeBlockLogicalDimAndStride.begin(), activeBlockLogicalDimAndStride.end(), [](const std::pair& left, const std::pair& right) { return left.second < right.second; }); +#endif auto cumulativeStride = 1; std::vector flatToActiveBlockExprs(activeBlockRank); @@ -4708,6 +4715,7 @@ MakeCacheOp CreateDoubleBufferTempArray(mlir::OpBuilder& builder, return builder.create(parentLambda.getLoc(), tempArrayType, memorySpaceEnum, + info.activeBlockToCacheMap, tempArrayAccessMap, tempArrayOffsetIndices, tempArrayMultiCacheAccessIndices); @@ -6437,6 +6445,29 @@ LogicalResult 
TensorizeAffineForOpConversion::matchAndRewrite(AffineForOp affine // 4. load C (void)innerLoopBodyIter++; + mlir::Operation* castOp = nullptr; + // TODO: Figure out if there's a better way to list the possible OPs that can be used to change type but still be valid IR (maybe an Accera specific cast OP that takes an attr instead?) + if (innerLoopBodyIter != innerLoopBodyEnd && isa< + // Ops that have CastOpInterface + mlir::FPExtOp, + mlir::FPToSIOp, + mlir::FPToUIOp, + mlir::FPTruncOp, + mlir::IndexCastOp, + mlir::SIToFPOp, + mlir::UIToFPOp, + mlir::UnrealizedConversionCastOp, + + // Ops that don't have CastOpInterface + mlir::SignExtendIOp, + mlir::TruncateIOp, + mlir::ZeroExtendIOp>( + innerLoopBodyIter)) + { + castOp = &(*innerLoopBodyIter++); + opsToErase.push(castOp); + } + if (innerLoopBodyIter == innerLoopBodyEnd || !isa(*innerLoopBodyIter)) { return reportMatchFailure(affineForOp, "Failed to match the load from C Op"); @@ -6449,11 +6480,11 @@ LogicalResult TensorizeAffineForOpConversion::matchAndRewrite(AffineForOp affine auto loadCMap = loadCOp.getAffineMap(); if (cRank < 2) { - return reportMatchFailure(loadCOp.getOperation(), "C array has rank < 2"); + return reportMatchFailure(loadCOp, "C array has rank < 2"); } if (cRank != loadCMap.getNumResults()) { - return reportMatchFailure(loadCOp.getOperation(), "Failed to match the load from C Op"); + return reportMatchFailure(loadCOp, "Failed to match the load from C Op"); } // scan loadCOperands and note which ones are loop vars that refer to GPU block/thread IDs (or are affine expressions of them) @@ -6463,12 +6494,12 @@ LogicalResult TensorizeAffineForOpConversion::matchAndRewrite(AffineForOp affine if (gpuDimsPerDimC.size() != 2) { - return reportMatchFailure(loadCOp.getOperation(), "Failed to match the load from C Op"); + return reportMatchFailure(loadCOp, "Failed to match the load from C Op"); } if (ContainsDim(gpuDimsPerDimC, GPUIndexDimension::Z)) { - return reportMatchFailure(loadCOp.getOperation(), 
"Failed to match: C op uses GPU Z dimension"); + return reportMatchFailure(loadCOp, "Failed to match: C op uses GPU Z dimension"); } opsToErase.push(loadCOp); @@ -6484,8 +6515,10 @@ LogicalResult TensorizeAffineForOpConversion::matchAndRewrite(AffineForOp affine { return reportMatchFailure(accumC, "Failed to match the accumulation op"); } + // Check that the operands for the addition op are in fact (A*B) and the load from C - if (!((accumC.lhs() == mulAB && accumC.rhs() == loadCOp) || (accumC.rhs() == mulAB && accumC.lhs() == loadCOp))) + auto mulABresolved = (castOp ? castOp : mulAB)->getResult(0); + if (!((accumC.lhs() == mulABresolved && accumC.rhs() == loadCOp) || (accumC.rhs() == mulABresolved && accumC.lhs() == loadCOp))) { return reportMatchFailure(accumC, "Failed to match the accumulation operands"); } diff --git a/accera/transforms/src/gpu/AcceraToGPUPass.cpp b/accera/transforms/src/gpu/AcceraToGPUPass.cpp index 141abaef..891ae5c8 100644 --- a/accera/transforms/src/gpu/AcceraToGPUPass.cpp +++ b/accera/transforms/src/gpu/AcceraToGPUPass.cpp @@ -498,9 +498,7 @@ struct ValueMMALoadSyncOpToRocDLConversion final : public OpConversionPattern intraTileOffsetExprs = { upperLeftCornerRowDim + fullRowOffsetExpr, upperLeftCornerColDim + fullColOffsetExpr }; - auto intraTileOffsetMap = mlir::AffineMap::get(3, 1, intraTileOffsetExprs, rewriter.getContext()); - auto mfmaExternalDimsMap = mlir::AffineMap::getMultiDimIdentityMap(externalIndices, rewriter.getContext()); - auto fullMatrixAccessMap = utilir::ConcatenateAndShiftAffineDimsAndMaps(rewriter, mfmaExternalDimsMap, intraTileOffsetMap); auto loop = rewriter.replaceOpWithNewOp(op, 0, vecSize, 1, vec); auto loopBuilder = utilir::MakeBodyBuilder(loop); auto inductionVar = loop.getInductionVar(); auto destVec = loop.getRegionIterArgs()[0]; + // 1 dim for the induction var, 1 symbol for the warp thread ID + auto warpTileOffsetMap = mlir::AffineMap::get(1, 1, { fullRowOffsetExpr, fullColOffsetExpr }, 
rewriter.getContext()); + + // Shift the dims in warpTileOffsetMap by the mfma external dim count + auto mfmaExternalDimsMap = mlir::AffineMap::getMultiDimIdentityMap(externalIndices, loopBuilder.getContext()); + warpTileOffsetMap = utilir::ConcatenateAndShiftAffineDimsAndMaps(loopBuilder, mfmaExternalDimsMap, warpTileOffsetMap); + + // The buffer being accessed may hold the tile data in a physical order different from the logical order, + // so use the tileAccessMap map attr if it exists to map how offsets within the logical tile translate + // to offsets within the physical buffer layout + auto tileAccessMapOpt = op.tileAccessMap(); + if (tileAccessMapOpt.hasValue()) + { + auto tileAccessMap = tileAccessMapOpt.getValue(); + warpTileOffsetMap = tileAccessMap.compose(warpTileOffsetMap); + } + + mlir::Value zeroIndex = rewriter.create(loc, 0); + std::vector warpTileOffsetOperands(upperLeftCornerPos.size(), zeroIndex); + warpTileOffsetOperands[warpTileOffsetOperands.size() - 2] = inductionVar; + warpTileOffsetOperands[warpTileOffsetOperands.size() - 1] = warpTidVal; + auto warpTileOffsetPhysicalPos = utilir::MultiDimAffineApply(loopBuilder, loc, warpTileOffsetMap, warpTileOffsetOperands); + + // Now that we have the offsets within the physical buffer for this thread in the warp, add that to the physical upper left corner position + // To get the full position in the memref + + std::vector matrixAccessExprs; + for (size_t i = 0; i < upperLeftCornerPos.size(); ++i) + { + matrixAccessExprs.emplace_back(loopBuilder.getAffineDimExpr(i) + loopBuilder.getAffineDimExpr(i + upperLeftCornerPos.size())); + } + + auto fullMatrixAccessMap = mlir::AffineMap::get(2 * upperLeftCornerPos.size(), 0, matrixAccessExprs, loopBuilder.getContext()); + std::vector accessOperands(upperLeftCornerPos); - accessOperands.push_back(inductionVar); - accessOperands.push_back(warpTidVal); + accessOperands.insert(accessOperands.end(), warpTileOffsetPhysicalPos.begin(), 
warpTileOffsetPhysicalPos.end()); + auto accessPos = utilir::MultiDimAffineApply(loopBuilder, loc, fullMatrixAccessMap, accessOperands); auto load = loopBuilder.create(loc, memref, accessPos); diff --git a/accera/utilities/include/TypeTraits.h b/accera/utilities/include/TypeTraits.h index 17c9c888..d963a28c 100644 --- a/accera/utilities/include/TypeTraits.h +++ b/accera/utilities/include/TypeTraits.h @@ -6,6 +6,7 @@ #pragma once +#include #include #include diff --git a/accera/value/include/EmitterContext.h b/accera/value/include/EmitterContext.h index d8838851..3e91e869 100644 --- a/accera/value/include/EmitterContext.h +++ b/accera/value/include/EmitterContext.h @@ -222,6 +222,9 @@ namespace value /// An instance of Value that contains a reference to the allocated memory Value StoreConstantData(ConstantData data, MemoryLayout layout, const std::string& name); + /// Returns true or false depending on whether the data stored is known to be a constant + bool IsConstantData(Value) const; + /// Makes a reference to a constant data source. 
This data source may originate from another context /// The constant data source that will be referenced from this context /// An instance of Value that contains a reference to the source Value @@ -349,8 +352,6 @@ namespace value Scalar Cast(Scalar value, ValueType type); - Scalar UnsignedCast(Scalar value, ValueType type); - Scalar Bitcast(Scalar value, ValueType type); IfContext If(Scalar test, std::function fn); @@ -444,6 +445,7 @@ namespace value virtual bool IsFunctionDefinedImpl(FunctionDeclaration decl) const = 0; virtual Value StoreConstantDataImpl(ConstantData data, MemoryLayout layout, const std::string& name) = 0; + virtual bool IsConstantDataImpl(Value) const = 0; virtual Value ResolveConstantDataReferenceImpl(Value constantDataSource) = 0; virtual void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) = 0; @@ -481,8 +483,6 @@ namespace value virtual Scalar CastImpl(Scalar value, ValueType type) = 0; - virtual Scalar UnsignedCastImpl(Scalar value, ValueType type) = 0; - virtual Scalar BitcastImpl(Scalar value, ValueType type) = 0; virtual IfContext IfImpl(Scalar test, std::function fn) = 0; diff --git a/accera/value/include/MLIREmitterContext.h b/accera/value/include/MLIREmitterContext.h index 273dd2fa..f3f68b16 100644 --- a/accera/value/include/MLIREmitterContext.h +++ b/accera/value/include/MLIREmitterContext.h @@ -135,6 +135,7 @@ namespace value bool IsFunctionDefinedImpl(FunctionDeclaration decl) const override; Value StoreConstantDataImpl(ConstantData data, MemoryLayout layout, const std::string& name) override; + bool IsConstantDataImpl(Value) const override; Value ResolveConstantDataReferenceImpl(Value constantDataSource) override; void ForImpl(MemoryLayout layout, std::function)> fn, const std::string& name) override; @@ -171,11 +172,8 @@ namespace value void MMAStoreSyncImpl(const MatrixFragment& source, Matrix& target, const int64_t rowOffset, const int64_t colOffset) override; Value MMAComputeSyncImpl(const 
MatrixFragment& A, const MatrixFragment& B, const MatrixFragment& C, uint32_t cbsz, uint32_t abid, uint32_t blgp) override; - Scalar CastImpl(Scalar value, ValueType type, bool doSignedCast); Scalar CastImpl(Scalar value, ValueType type) override; - Scalar UnsignedCastImpl(Scalar value, ValueType type) override; - Scalar BitcastImpl(Scalar value, ValueType type) override; IfContext IfImpl(Scalar test, std::function fn) override; diff --git a/accera/value/include/Scalar.h b/accera/value/include/Scalar.h index 565488fa..ac9b8fd6 100644 --- a/accera/value/include/Scalar.h +++ b/accera/value/include/Scalar.h @@ -76,6 +76,10 @@ namespace value /// The type ValueType GetType() const; + // Returns [s1, s2] with either s1 casted to s2's type or s2 casted to s1's type or unchanged if + // there is no implicit type conversion that can be done. + static std::pair MakeTypeCompatible(Scalar s1, Scalar s2); + private: friend Scalar operator+(Scalar, Scalar); friend Scalar operator*(Scalar, Scalar); @@ -110,7 +114,6 @@ namespace value { return MakeScalar(GetValueType(), name); } - } // namespace value } // namespace accera diff --git a/accera/value/include/Value.h b/accera/value/include/Value.h index 447889f9..07fbe5c0 100644 --- a/accera/value/include/Value.h +++ b/accera/value/include/Value.h @@ -27,6 +27,12 @@ namespace accera { namespace value { + class TypeMismatchException : public utilities::GenericException + { + public: + TypeMismatchException(ValueType expected, ValueType actual) : + GenericException(ToString(actual) + " type is incompatible with " + ToString(expected)) {} + }; class Value; class Scalar; @@ -305,7 +311,7 @@ namespace value /// Returns true if the instance is undefined bool IsUndefined() const; - /// Returns true if the instance holds data + /// Returns true if the instance does not hold data bool IsEmpty() const; /// Returns true if the instance holds constant data @@ -375,6 +381,9 @@ namespace value /// Clear the data, if any, on this instance void 
ClearData(); + /// Resets this instance + void Clear(); + /// Returns the number of pointer indirections on the data referred to by this instance /// The number of pointer indirections int PointerLevel() const; @@ -495,6 +504,8 @@ namespace value return Cast(value, GetValueType()); } + bool IsImplicitlyCastable(ViewAdapter v1, ViewAdapter v2); + } // namespace value } // namespace accera diff --git a/accera/value/include/ValueType.h b/accera/value/include/ValueType.h index 3a4a4320..ccbbbff9 100644 --- a/accera/value/include/ValueType.h +++ b/accera/value/include/ValueType.h @@ -171,6 +171,61 @@ namespace value } } + constexpr inline bool IsSignedType(ValueType t) + { + switch (t) + { + case ValueType::BFloat16: + [[fallthrough]]; + case ValueType::Float16: + [[fallthrough]]; + case ValueType::Float: + [[fallthrough]]; + case ValueType::Double: + [[fallthrough]]; + case ValueType::Int8: + [[fallthrough]]; + case ValueType::Int16: + [[fallthrough]]; + case ValueType::Int32: + [[fallthrough]]; + case ValueType::Int64: + return true; + default: + return false; + } + } + + constexpr inline bool IsUnsignedType(ValueType t) + { + switch (t) + { + case ValueType::Boolean: + [[fallthrough]]; + case ValueType::Byte: + [[fallthrough]]; + case ValueType::Uint16: + [[fallthrough]]; + case ValueType::Uint32: + [[fallthrough]]; + case ValueType::Uint64: + return true; + default: + return false; + } + } + + constexpr inline bool IsSignlessType(ValueType t) + { + switch (t) + { + case ValueType::Index: + return true; + default: + return false; + } + } + /// Get a string representation of the enum value std::string ToString(ValueType t); ValueType FromString(std::string name); diff --git a/accera/value/src/EmitterContext.cpp b/accera/value/src/EmitterContext.cpp index 874035cb..3729166e 100644 --- a/accera/value/src/EmitterContext.cpp +++ b/accera/value/src/EmitterContext.cpp @@ -139,6 +139,15 @@ namespace value Value EmitterContext::StoreConstantData(ConstantData data, 
MemoryLayout layout, const std::string& name) { return StoreConstantDataImpl(data, layout, name); } + bool EmitterContext::IsConstantData(Value v) const + { + if (!v.IsDefined() || v.IsEmpty()) return false; + + if (!std::holds_alternative(v.GetUnderlyingData())) return true; + + return IsConstantDataImpl(v); + } + Value EmitterContext::ResolveConstantDataReference(Value source) { return ResolveConstantDataReferenceImpl(source); } void EmitterContext::For(MemoryLayout layout, std::function)> fn, const std::string& name) @@ -251,11 +260,6 @@ namespace value return CastImpl(value, type); } - Scalar EmitterContext::UnsignedCast(Scalar value, ValueType type) - { - return UnsignedCastImpl(value, type); - } - Scalar EmitterContext::Bitcast(Scalar value, ValueType type) { return BitcastImpl(value, type); diff --git a/accera/value/src/MLIREmitterContext.cpp b/accera/value/src/MLIREmitterContext.cpp index ab5dd3ac..ba9468cf 100644 --- a/accera/value/src/MLIREmitterContext.cpp +++ b/accera/value/src/MLIREmitterContext.cpp @@ -6,6 +6,7 @@ #include "MLIREmitterContext.h" #include "CompilerOptions.h" +#include "ValueType.h" #include #include @@ -17,6 +18,7 @@ #include #include +#include #include #include @@ -302,7 +304,7 @@ mlir::FunctionType ToMLIRType(mlir::OpBuilder& builder, const FunctionDeclaratio [[nodiscard]] mlir::Value ToMLIRValue(mlir::OpBuilder& builder, const ViewAdapter& view) { auto value = view.GetValue(); - if (value.IsEmpty() || value.IsUndefined() || value.IsConstant()) + if (value.IsEmpty() || value.IsUndefined()) { return {}; } @@ -1449,6 +1451,19 @@ Value MLIRContext::StoreConstantDataImpl(ConstantData data, MemoryLayout layout, return Value(emittable, layout); } +bool MLIRContext::IsConstantDataImpl(Value v) const +{ + auto data = Unwrap(v); + + // TODO: Extend this check to handle constant data arrays. 
Right now, this only works for scalar values + if (llvm::isa_and_nonnull(data.getDefiningOp())) + { + return true; + } + + return false; +} + Value MLIRContext::ResolveConstantDataReferenceImpl(Value constantDataSource) { auto sourceRefGlobalOp = mlir::Value::getFromOpaquePointer(constantDataSource.Get().GetDataAs()->data).getDefiningOp(); @@ -2013,7 +2028,7 @@ Value MLIRContext::MMAComputeSyncImpl(const MatrixFragment& A, const MatrixFragm return Value(emittable, C.GetValue().GetLayout()); } -Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) +Scalar MLIRContext::CastImpl(Scalar value, ValueType type) { auto& builder = _impl->builder; mlir::Value mlirValue = ResolveMLIRScalar(builder, ToMLIRValue(builder, value)); @@ -2027,6 +2042,8 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) return Wrap(mlirValue); } + auto doSignedCast = IsSignedType(type); + return mlir::TypeSwitch(fromType) .Case([&](mlir::IntegerType fromIntType) { auto signlessMlirValue = accera::ir::util::ToSignlessMLIRValue(builder, mlirValue); @@ -2034,22 +2051,31 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) return mlir::TypeSwitch(toType) .Case([&](mlir::IntegerType toIntType) { auto toIntTypeSignless = accera::ir::util::ToSignlessMLIRType(builder, toIntType); + + mlir::Value casted; if (fromIntType.getWidth() == toIntType.getWidth()) { - return Wrap(signlessMlirValue); + // do nothing } else if (fromIntType.getWidth() > toIntType.getWidth()) { - return Wrap(builder.create(loc, signlessMlirValue, toIntTypeSignless)); - } - else if (doSignedCast || !fromIntType.isUnsigned()) - { - return Wrap(builder.create(loc, signlessMlirValue, toIntTypeSignless)); + + signlessMlirValue = builder.create(loc, signlessMlirValue, toIntTypeSignless); } else { - return Wrap(builder.create(loc, signlessMlirValue, toIntTypeSignless)); + if (doSignedCast) + { + signlessMlirValue = builder.create(loc, signlessMlirValue, 
toIntTypeSignless); + } + else + { + signlessMlirValue = builder.create(loc, signlessMlirValue, toIntTypeSignless); + } } + + casted = builder.create(signlessMlirValue.getLoc(), toType, signlessMlirValue)->getResult(0); + return Wrap(casted); }) .Case([&](mlir::IndexType) { return Wrap(builder.create(loc, signlessMlirValue, toType)); @@ -2057,10 +2083,8 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) .Case([&](mlir::FloatType) { return fromIntType.isUnsigned() ? Wrap(builder.create(loc, signlessMlirValue, toType)) : Wrap(builder.create(loc, signlessMlirValue, toType)); }) - .Default([&](mlir::Type) { + .Default([&](mlir::Type) -> Scalar { throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, __FILE__ " : " + std::to_string(__LINE__)); - llvm_unreachable("unexpected"); - return Scalar(); }); }) .Case([&](mlir::IndexType) { @@ -2073,10 +2097,8 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) auto int64Value = builder.create(loc, mlirValue, builder.getI64Type()); // index->int64 return Wrap(builder.create(loc, int64Value, toType)); // int64->fp }) - .Default([&](mlir::Type) { + .Default([&](mlir::Type) -> Scalar { throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, __FILE__ " : " + std::to_string(__LINE__)); - llvm_unreachable("unexpected"); - return Scalar(); }); }) .Case([&](mlir::FloatType fromFloatType) { @@ -2088,29 +2110,15 @@ Scalar MLIRContext::CastImpl(Scalar value, ValueType type, bool doSignedCast) .Case([&](mlir::FloatType toFloatType) { return fromFloatType.getWidth() > toFloatType.getWidth() ? 
Wrap(builder.create(loc, mlirValue, toType)) : Wrap(builder.create(loc, mlirValue, toType)); }) - .Default([&](mlir::Type) { + .Default([&](mlir::Type) -> Scalar { throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, __FILE__ " : " + std::to_string(__LINE__)); - llvm_unreachable("unexpected"); - return Scalar(); }); }) - .Default([&](mlir::Type) { + .Default([&](mlir::Type) -> Scalar { throw utilities::LogicException(utilities::LogicExceptionErrors::notImplemented, __FILE__ " : " + std::to_string(__LINE__)); - llvm_unreachable("unexpected"); - return Scalar(); }); } -Scalar MLIRContext::CastImpl(Scalar value, ValueType type) -{ - return CastImpl(value, type, /*doSignedCast=*/true); -} - -Scalar MLIRContext::UnsignedCastImpl(Scalar value, ValueType type) -{ - return CastImpl(value, type, /*doSignedCast=*/false); -} - Scalar MLIRContext::BitcastImpl(Scalar value, ValueType type) { auto& builder = _impl->builder; diff --git a/accera/value/src/Scalar.cpp b/accera/value/src/Scalar.cpp index 5d8803a7..737eb646 100644 --- a/accera/value/src/Scalar.cpp +++ b/accera/value/src/Scalar.cpp @@ -5,7 +5,10 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// #include "Scalar.h" +#include "Emittable.h" #include "EmitterContext.h" +#include "ScalarOperations.h" +#include "Value.h" #include "ValueType.h" #include @@ -17,42 +20,6 @@ namespace accera { namespace value { - namespace - { - template - constexpr bool ItemIsOneOf(T&& t, C&& c) - { - return llvm::any_of(c, [=](auto arg) { return t == arg; }); - } - - bool IsImplcitTypeCastable(ValueType source, ValueType target) - { -#define MAP_TARGET_TO_POSSIBLE_SOURCES(TARGET, ...) 
\ - case TARGET: \ - return ItemIsOneOf(source, std::initializer_list{ __VA_ARGS__ }) - - switch (target) - { - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int8, ValueType::Boolean); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Byte, ValueType::Boolean, ValueType::Int8); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Uint16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int32, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Uint32); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint32, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int64, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Uint64); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint64, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Float16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::BFloat16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Float, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, ValueType::Uint64, ValueType::Float16, ValueType::BFloat16); - MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Double, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, 
ValueType::Uint64, ValueType::Float16, ValueType::BFloat16, ValueType::Float); - - default: - return false; - } - -#undef MAP_TARGET_TO_POSSIBLE_SOURCES - } - } // namespace using namespace utilities; Scalar::Scalar() = default; @@ -81,10 +48,16 @@ namespace value { if (this != &other) { - if (GetType() != other.GetType() && IsImplcitTypeCastable(other.GetType(), GetType())) + auto e1 = _value.TryGet(); + auto e2 = other._value.TryGet(); + if (e1 && e2 && e1->GetDataAs() == e2->GetDataAs()) + { + return *this; + } + if (GetType() != other.GetType() && IsImplicitlyCastable(other, *this)) { Scalar castedScalar = Cast(other, GetType()); - _value = castedScalar._value; + GetContext().CopyData(castedScalar._value, _value); } else { @@ -98,21 +71,30 @@ namespace value { if (this != &other) { - if (GetType() != other.GetType() && IsImplcitTypeCastable(other.GetType(), GetType())) + auto e1 = _value.TryGet(); + auto e2 = other._value.TryGet(); + if (e1 && e2 && e1->GetDataAs() == e2->GetDataAs()) + { + return *this; + } + if (GetType() != other.GetType() && IsImplicitlyCastable(other, *this)) { Scalar castedScalar = Cast(other, GetType()); - _value = std::move(castedScalar._value); + GetContext().MoveData(castedScalar._value, _value); } else { _value = std::move(other._value); } - other._value = Value(); + other._value.Clear(); } return *this; } - Value Scalar::GetValue() const { return _value; } + Value Scalar::GetValue() const + { + return _value; + } Scalar Scalar::Copy() const { @@ -121,42 +103,185 @@ namespace value return s; } - ValueType Scalar::GetType() const { return _value.GetBaseType(); } + ValueType Scalar::GetType() const + { + return _value.GetBaseType(); + } - void Scalar::SetName(const std::string& name) { _value.SetName(name); } + void Scalar::SetName(const std::string& name) + { + _value.SetName(name); + } - std::string Scalar::GetName() const { return _value.GetName(); } + std::string Scalar::GetName() const + { + return _value.GetName(); + } Scalar& 
Scalar::operator+=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::add, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::add, _value, rhs._value); return *this; } Scalar& Scalar::operator-=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::subtract, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::subtract, _value, rhs._value); + // auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(*this, s); + + // _value = GetContext().BinaryOperation(ValueBinaryOperation::subtract, lhs._value, rhs._value); return *this; } Scalar& Scalar::operator*=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::multiply, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::multiply, _value, rhs._value); + // auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(*this, s); + + // _value = GetContext().BinaryOperation(ValueBinaryOperation::multiply, lhs._value, rhs._value); return *this; } Scalar& Scalar::operator/=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::divide, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw 
TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::divide, _value, rhs._value); + // auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(*this, s); + + // _value = GetContext().BinaryOperation(ValueBinaryOperation::divide, lhs._value, rhs._value); return *this; } Scalar& Scalar::operator%=(Scalar s) { - _value = GetContext().BinaryOperation(ValueBinaryOperation::modulus, _value, s._value); + Scalar rhs; + if (s.GetType() != GetType()) + { + if (IsImplicitlyCastable(s, *this)) + { + rhs = Cast(s, GetType()); + } + else + { + throw TypeMismatchException(GetType(), s.GetType()); + } + } + else + { + rhs = s; + } + + _value = GetContext().BinaryOperation(ValueBinaryOperation::modulus, _value, rhs._value); + // auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(*this, s); + + // _value = GetContext().BinaryOperation(ValueBinaryOperation::modulus, lhs._value, rhs._value); return *this; } + std::pair Scalar::MakeTypeCompatible(Scalar s1, Scalar s2) + { + auto s1Type = s1.GetType(); + auto s2Type = s2.GetType(); + + if (s1Type == s2Type) + { + return { s1, s2 }; + } + + assert((!s1.IsConstant() || !s2.IsConstant()) && "Unexpected scenario"); + + Scalar newS1, newS2; + if (s1.IsConstant()) + { + newS1 = Cast(s1, s2Type); + newS2 = s2; + } + else if (s2.IsConstant()) + { + newS1 = s1; + newS2 = Cast(s2, s1Type); + } + else if (IsImplicitlyCastable(s1, s2)) + { + newS1 = Cast(s1, s2Type); + newS2 = s2; + } + else if (IsImplicitlyCastable(s2, s1)) + { + newS1 = s1; + newS2 = Cast(s2, s1Type); + } + + return { newS1, newS2 }; + } + // Free function operator overloads Scalar operator+(Scalar s1, Scalar s2) { @@ -234,32 +359,44 @@ namespace value Scalar operator==(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::equality, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return 
GetContext().LogicalOperation(ValueLogicalOperation::equality, lhs.GetValue(), rhs.GetValue()); } Scalar operator!=(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::inequality, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::inequality, lhs.GetValue(), rhs.GetValue()); } Scalar operator<=(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::lessthanorequal, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::lessthanorequal, lhs.GetValue(), rhs.GetValue()); } Scalar operator<(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::lessthan, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::lessthan, lhs.GetValue(), rhs.GetValue()); } Scalar operator>=(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::greaterthanorequal, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::greaterthanorequal, lhs.GetValue(), rhs.GetValue()); } Scalar operator>(Scalar s1, Scalar s2) { - return GetContext().LogicalOperation(ValueLogicalOperation::greaterthan, s1.GetValue(), s2.GetValue()); + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); + + return GetContext().LogicalOperation(ValueLogicalOperation::greaterthan, lhs.GetValue(), rhs.GetValue()); } Scalar operator&&(Scalar s1, Scalar s2) @@ -268,8 +405,9 @@ namespace value { throw LogicException(LogicExceptionErrors::illegalState); } + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); - return GetContext().BinaryOperation(ValueBinaryOperation::logicalAnd, s1.GetValue(), s2.GetValue()); 
+ return GetContext().BinaryOperation(ValueBinaryOperation::logicalAnd, lhs.GetValue(), rhs.GetValue()); } Scalar operator||(Scalar s1, Scalar s2) @@ -278,8 +416,9 @@ namespace value { throw LogicException(LogicExceptionErrors::illegalState); } + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(s1, s2); - return GetContext().BinaryOperation(ValueBinaryOperation::logicalOr, s1.GetValue(), s2.GetValue()); + return GetContext().BinaryOperation(ValueBinaryOperation::logicalOr, lhs.GetValue(), rhs.GetValue()); } Scalar MakeScalar(ValueType type, const std::string&) diff --git a/accera/value/src/ScalarOperations.cpp b/accera/value/src/ScalarOperations.cpp index 5babd398..6506974a 100644 --- a/accera/value/src/ScalarOperations.cpp +++ b/accera/value/src/ScalarOperations.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace accera { @@ -96,31 +97,36 @@ namespace value Scalar Add(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy += s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs += rhs; } Scalar Subtract(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy -= s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs -= rhs; } Scalar Multiply(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy *= s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs *= rhs; } Scalar Divide(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy /= s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs /= rhs; } Scalar Modulo(Scalar s1, Scalar s2) { Scalar copy = s1.Copy(); - return copy %= s2; + auto&& [lhs, rhs] = Scalar::MakeTypeCompatible(copy, s2); + return lhs %= rhs; } Scalar FusedMultiplyAdd(Scalar a, Scalar b, Scalar c) @@ -194,16 +200,24 @@ namespace value Scalar Max(Scalar s1, Scalar s2) { + std::tie(s1, s2) = Scalar::MakeTypeCompatible(s1, s2); + return Select(s1 > s2, s1, s2); } Scalar Min(Scalar s1, Scalar s2) { + std::tie(s1, s2) = 
Scalar::MakeTypeCompatible(s1, s2); + return Select(s1 < s2, s1, s2); } Scalar Clamp(Scalar s, Scalar min, Scalar max) { + std::tie(min, max) = Scalar::MakeTypeCompatible(min, max); + std::tie(s, min) = Scalar::MakeTypeCompatible(s, min); + std::tie(s, max) = Scalar::MakeTypeCompatible(s, max); + return Min(max, Max(s, min)); } @@ -220,6 +234,7 @@ namespace value Scalar Select(Scalar cmp, Scalar a, Scalar b) { + std::tie(a, b) = Scalar::MakeTypeCompatible(a, b); return ScalarOpBuilder(cmp, a, b); } diff --git a/accera/value/src/Value.cpp b/accera/value/src/Value.cpp index 498a537f..ba094216 100644 --- a/accera/value/src/Value.cpp +++ b/accera/value/src/Value.cpp @@ -6,6 +6,7 @@ #include "Value.h" #include "EmitterContext.h" +#include "ValueType.h" #include #include @@ -177,7 +178,6 @@ namespace value { _type = other._type; } - GetContext().MoveData(other, *this); } } @@ -213,7 +213,7 @@ namespace value void Value::SetData(Value value, bool force) { - if (!force && value.IsConstrained() && value.GetLayout() != GetLayout()) + if (!force && IsConstrained() && value.IsConstrained() && value.GetLayout() != GetLayout()) { throw InputException(InputExceptionErrors::invalidArgument); } @@ -226,10 +226,7 @@ namespace value } _data = emittable; - if (!force) - { - _type = type; - } + _type = type; }, [this, force](auto&& arg) { if (!force && GetValueType>() != _type.first) @@ -242,9 +239,15 @@ namespace value value._data); } - bool Value::IsDefined() const { return _type.first != ValueType::Undefined; } + bool Value::IsDefined() const + { + return _type.first != ValueType::Undefined; + } - bool Value::IsUndefined() const { return !IsDefined(); } + bool Value::IsUndefined() const + { + return !IsDefined(); + } bool Value::IsEmpty() const { @@ -255,7 +258,10 @@ namespace value _data); } - bool Value::IsConstant() const { return !std::holds_alternative(_data); } + bool Value::IsConstant() const + { + return GetContext().IsConstantData(*this); + } bool Value::IsIntegral() const 
{ @@ -286,54 +292,128 @@ namespace value } } - bool Value::IsBoolean() const { return _type.first == ValueType::Boolean; } + bool Value::IsBoolean() const + { + return _type.first == ValueType::Boolean; + } - bool Value::IsByte() const { return _type.first == ValueType::Byte; } + bool Value::IsByte() const + { + return _type.first == ValueType::Byte; + } - bool Value::IsInt8() const { return _type.first == ValueType::Int8; } + bool Value::IsInt8() const + { + return _type.first == ValueType::Int8; + } - bool Value::IsInt16() const { return _type.first == ValueType::Int16; } + bool Value::IsInt16() const + { + return _type.first == ValueType::Int16; + } - bool Value::IsInt32() const { return _type.first == ValueType::Int32; } + bool Value::IsInt32() const + { + return _type.first == ValueType::Int32; + } - bool Value::IsInt64() const { return _type.first == ValueType::Int64; } + bool Value::IsInt64() const + { + return _type.first == ValueType::Int64; + } - bool Value::IsUint16() const { return _type.first == ValueType::Uint16; } + bool Value::IsUint16() const + { + return _type.first == ValueType::Uint16; + } - bool Value::IsUint32() const { return _type.first == ValueType::Uint32; } + bool Value::IsUint32() const + { + return _type.first == ValueType::Uint32; + } - bool Value::IsUint64() const { return _type.first == ValueType::Uint64; } + bool Value::IsUint64() const + { + return _type.first == ValueType::Uint64; + } - bool Value::IsIndex() const { return _type.first == ValueType::Index; } + bool Value::IsIndex() const + { + return _type.first == ValueType::Index; + } bool Value::IsFloatingPoint() const { return (_type.first == ValueType::Float16 || _type.first == ValueType::Float || _type.first == ValueType::Double || _type.first == ValueType::BFloat16); } - bool Value::IsFloat16() const { return _type.first == ValueType::Float16; } + bool Value::IsFloat16() const + { + return _type.first == ValueType::Float16; + } - bool Value::IsFloat32() const { return 
_type.first == ValueType::Float; } + bool Value::IsFloat32() const + { + return _type.first == ValueType::Float; + } - bool Value::IsDouble() const { return _type.first == ValueType::Double; } + bool Value::IsDouble() const + { + return _type.first == ValueType::Double; + } - bool Value::IsConstrained() const { return _layout.has_value(); } + bool Value::IsConstrained() const + { + return _layout.has_value(); + } - const MemoryLayout& Value::GetLayout() const { return _layout.value(); } + const MemoryLayout& Value::GetLayout() const + { + return _layout.value(); + } - ValueType Value::GetBaseType() const { return _type.first; } + ValueType Value::GetBaseType() const + { + return _type.first; + } - void Value::SetLayout(MemoryLayout layout) { GetContext().SetLayout(*this, layout); } + void Value::SetLayout(MemoryLayout layout) + { + GetContext().SetLayout(*this, layout); + } - void Value::ClearLayout() { _layout.reset(); } + void Value::ClearLayout() + { + _layout.reset(); + } - void Value::ClearData() { _data = Emittable{ nullptr }; } + void Value::ClearData() + { + _data = Emittable{ nullptr }; + } - int Value::PointerLevel() const { return _type.second; } + void Value::Clear() + { + ClearData(); + ClearLayout(); + _type = {}; + _hasName = false; + } - Value::UnderlyingDataType& Value::GetUnderlyingData() { return _data; } + int Value::PointerLevel() const + { + return _type.second; + } - const Value::UnderlyingDataType& Value::GetUnderlyingData() const { return _data; } + Value::UnderlyingDataType& Value::GetUnderlyingData() + { + return _data; + } + + const Value::UnderlyingDataType& Value::GetUnderlyingData() const + { + return _data; + } void Value::SetName(const std::string& name) { @@ -346,7 +426,10 @@ namespace value return GetContext().GetName(*this); } - bool Value::HasCustomName() const { return _hasName; } + bool Value::HasCustomName() const + { + return _hasName; + } namespace detail { @@ -417,6 +500,46 @@ namespace value return ValueType::Undefined; } 
+ + namespace + { + template + constexpr bool ItemIsOneOf(T&& t, C&& c) + { + return llvm::any_of(c, [=](auto arg) { return t == arg; }); + } + + } // namespace + + bool IsImplicitlyCastable(ViewAdapter v1, ViewAdapter v2) + { + auto source = v1.GetValue().GetBaseType(); + auto target = v2.GetValue().GetBaseType(); + +#define MAP_TARGET_TO_POSSIBLE_SOURCES(TARGET, ...) \ + case TARGET: \ + return ItemIsOneOf(source, std::initializer_list{ __VA_ARGS__ }) + + switch (target) + { + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int8, ValueType::Boolean, ValueType::Byte); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Byte, ValueType::Boolean, ValueType::Int8); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Uint16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int32, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Uint32); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint32, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Int64, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Uint64); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Uint64, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Float16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::BFloat16, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Float, ValueType::Boolean, 
ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, ValueType::Uint64, ValueType::Float16, ValueType::BFloat16); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Double, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, ValueType::Uint64, ValueType::Float16, ValueType::BFloat16, ValueType::Float); + MAP_TARGET_TO_POSSIBLE_SOURCES(ValueType::Index, ValueType::Boolean, ValueType::Int8, ValueType::Byte, ValueType::Int16, ValueType::Uint16, ValueType::Int32, ValueType::Uint32, ValueType::Int64, ValueType::Uint64); + + default: + return false; + } + } } // namespace value } // namespace accera diff --git a/accera/value/src/ValueOperations.cpp b/accera/value/src/ValueOperations.cpp index e2ef7233..e459ed03 100644 --- a/accera/value/src/ValueOperations.cpp +++ b/accera/value/src/ValueOperations.cpp @@ -70,15 +70,5 @@ namespace value return GetContext().Cast(value, type); } - Scalar UnsignedCast(Scalar value, ValueType type) - { - if (value.GetType() == type) - { - return value; - } - - return GetContext().UnsignedCast(value, type); - } - } // namespace value } // namespace accera diff --git a/docs/.bumpversion.cfg b/docs/.bumpversion.cfg index 25c610e8..8f395e6c 100644 --- a/docs/.bumpversion.cfg +++ b/docs/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.2.6 +current_version = 1.2.7 [bumpversion:glob:**/*.md] search = Version: v{current_version} diff --git a/docs/Case Studies/CONTRIBUTING.md b/docs/Case Studies/CONTRIBUTING.md index 1b4d4a12..134cf155 100644 --- a/docs/Case Studies/CONTRIBUTING.md +++ b/docs/Case Studies/CONTRIBUTING.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Contributing Guide diff --git a/docs/Case Studies/README.md b/docs/Case Studies/README.md index aed5655d..51318707 100644 --- a/docs/Case Studies/README.md +++ 
b/docs/Case Studies/README.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Accera Case Studies diff --git a/docs/Install/Building_on_MacOS.md b/docs/Install/Building_on_MacOS.md index 5d9cfd2e..a3eb90fc 100644 --- a/docs/Install/Building_on_MacOS.md +++ b/docs/Install/Building_on_MacOS.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on MacOS diff --git a/docs/Install/Building_on_Ubuntu.md b/docs/Install/Building_on_Ubuntu.md index 9a69f660..80ca2a4e 100644 --- a/docs/Install/Building_on_Ubuntu.md +++ b/docs/Install/Building_on_Ubuntu.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on Ubuntu diff --git a/docs/Install/Building_on_Windows.md b/docs/Install/Building_on_Windows.md index 27bd5420..e37622f3 100644 --- a/docs/Install/Building_on_Windows.md +++ b/docs/Install/Building_on_Windows.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on Windows diff --git a/docs/Install/Installing_Accera_on_MacOS.md b/docs/Install/Installing_Accera_on_MacOS.md index 21d229d4..51fef805 100644 --- a/docs/Install/Installing_Accera_on_MacOS.md +++ b/docs/Install/Installing_Accera_on_MacOS.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on MacOS diff --git a/docs/Install/Installing_Accera_on_Ubuntu.md b/docs/Install/Installing_Accera_on_Ubuntu.md index 4c954e39..0355e46f 100644 --- a/docs/Install/Installing_Accera_on_Ubuntu.md +++ b/docs/Install/Installing_Accera_on_Ubuntu.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on Ubuntu diff --git a/docs/Install/Installing_Accera_on_Windows.md b/docs/Install/Installing_Accera_on_Windows.md index 68fe7885..174d520b 100644 --- a/docs/Install/Installing_Accera_on_Windows.md 
+++ b/docs/Install/Installing_Accera_on_Windows.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Installing on Windows diff --git a/docs/Install/README.md b/docs/Install/README.md index 93f3f566..b29d3864 100644 --- a/docs/Install/README.md +++ b/docs/Install/README.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Install from PyPI The quickest way to get up and running is to install the pre-built Python packages: diff --git a/docs/Manual/00 Introduction.md b/docs/Manual/00 Introduction.md index 7a29671f..3a4272ce 100644 --- a/docs/Manual/00 Introduction.md +++ b/docs/Manual/00 Introduction.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Introduction Accera is a framework with a Python-based Domain-specific Language (eDSL) that produces optimized compute-intensive code. Accera's primary focus is the optimization of affine and semi-affine nested for-loops for CPU and GPU targets. diff --git a/docs/Manual/01 Arrays.md b/docs/Manual/01 Arrays.md index 64c441e0..383467c8 100644 --- a/docs/Manual/01 Arrays.md +++ b/docs/Manual/01 Arrays.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 1: Arrays Accera stores data in multi-dimensional arrays of scalar elements where all the array elements share the same primary data type (e.g., float32, int8). An array has a constant number of dimensions *d* known at compile-time (e.g., a matrix is a 2-dimensional array). Each dimension has a positive size, and the sequence of *d* sizes is called the *shape* of the array. An element of an array is referred to by a *d*-coordinate zero-based *index vector*. 
diff --git a/docs/Manual/02 Simple Affine Loop Nests.md b/docs/Manual/02 Simple Affine Loop Nests.md index 3a9b4a07..6dbf8fc3 100644 --- a/docs/Manual/02 Simple Affine Loop Nests.md +++ b/docs/Manual/02 Simple Affine Loop Nests.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 2: Simple affine loop nests This section introduces *loop nests* and their different types that are provided in Accera programming model. @@ -79,7 +79,7 @@ nest.iteration_logic(logic_fn) ``` ## Supported operations -The iteration logic can include the following operations (assuming `accera` was imported as `rp`): +The iteration logic can include the following operations (assuming `accera` was imported as `acc`): ### Assignment operators @@ -156,6 +156,35 @@ Comment: Accera also supports the corresponding compound-assignment operators, s | `acc.cosh(a)` | `acc.ScalarType.float16/32/64` | Returns the hyperbolic cosine of scalar *a*, where *a* is in radians | | `acc.tanh(a)` | `acc.ScalarType.float16/32/64` | Returns the hyperbolic tangent of scalar *a*, where *a* is in radians | +### Implicit type casting + +Accera operators require operands to be the same type. Computations that use multiple types can take advantage of Accera's implicit type casting support when converting from smaller-sized types to larger-sized types. + +To do implicit casting, simply assign a source type to its implicitly-castable destination type. No additional casting operation is needed for converting between these types. 
+ +| Source types | Destination type (implicitly-castable) | +| ------------ | -------------------------------------- | +| `acc.ScalarType.bool`, `acc.ScalarType.uint8` | `acc.ScalarType.int8` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8` | `acc.ScalarType.uint8` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.uint16` | `acc.ScalarType.int16` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16` | `acc.ScalarType.uint16` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.uint32` | `acc.ScalarType.int32` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.int32` | `acc.ScalarType.uint32` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.int32`, `acc.ScalarType.uint32`, `acc.ScalarType.uint64` | `acc.ScalarType.int64` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.int32`, `acc.ScalarType.uint32`, `acc.ScalarType.int64` | `acc.ScalarType.uint64` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16` | `acc.ScalarType.float16` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16` | `acc.ScalarType.bfloat16` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, `acc.ScalarType.uint16`, `acc.ScalarType.int32`, `acc.ScalarType.uint32`, `acc.ScalarType.int64`, `acc.ScalarType.float16`, `acc.ScalarType.bfloat16` | `acc.ScalarType.float32` | +| `acc.ScalarType.bool`, `acc.ScalarType.int8`, `acc.ScalarType.uint8`, `acc.ScalarType.int16`, 
`acc.ScalarType.uint16`, `acc.ScalarType.int32`, `acc.ScalarType.uint32`, `acc.ScalarType.int64`, `acc.ScalarType.float16`, `acc.ScalarType.bfloat16`, `acc.ScalarType.float32` | `acc.ScalarType.float64` | + +[comment]: # (bool, int8, uint8, int16, uint16, int32, uint32, int64, uint64 | index) + +To override the casting behavior above, or cast a larger-sized type to a smaller-sized type, use the `acc.cast` operation. + +Comment: implicit casting of constants may result in truncation. + +[comment]: # (MISSING: examples for constant implicit casting that cause unexpected truncation) + ## Accera program stages Let’s take a step back to describe the stages of Accera program: diff --git a/docs/Manual/03 Schedules.md b/docs/Manual/03 Schedules.md index 58d8ba95..876ccaa4 100644 --- a/docs/Manual/03 Schedules.md +++ b/docs/Manual/03 Schedules.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 3: Schedules We begin with `nest` from [Section 2](<02%20Simple%20Affine%20Loop%20Nests.md>) which captures the logic of matrix-matrix multiplication. We use `nest` to create a `Schedule` that controls the execution order of the nest's iterations. Schedules are target-independent in the sense that the same schedule can be used to emit code for multiple target platforms. diff --git a/docs/Manual/04 Fusing.md b/docs/Manual/04 Fusing.md index f363c024..c0795091 100644 --- a/docs/Manual/04 Fusing.md +++ b/docs/Manual/04 Fusing.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 4: Fusing With `fuse` operation, multiple schedules can be combined into a single schedule representing the union of the work in the original schedules. These fused schedules can be transformed by any of the transformations presented in [Section 3](<03%20Schedules.md>). 
diff --git a/docs/Manual/05 Targets.md b/docs/Manual/05 Targets.md index 14cacaa2..7d305691 100644 --- a/docs/Manual/05 Targets.md +++ b/docs/Manual/05 Targets.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 5: Targets Accera is a cross compiler, which means that it can generate executable code for different target platforms. A target is described using the `Target` class. Accera already supports many different targets, for example: diff --git a/docs/Manual/06 Plans - Caching.md b/docs/Manual/06 Plans - Caching.md index 4a589825..83a91d6d 100644 --- a/docs/Manual/06 Plans - Caching.md +++ b/docs/Manual/06 Plans - Caching.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 6: Plans - Caching In the previous sections, we defined the logic and then scheduled its iterations. Now, let's move on to completing the implementation with target-specific options. diff --git a/docs/Manual/07 Plans - Operations and Optimizations.md b/docs/Manual/07 Plans - Operations and Optimizations.md index 36e91305..f3c535d1 100644 --- a/docs/Manual/07 Plans - Operations and Optimizations.md +++ b/docs/Manual/07 Plans - Operations and Optimizations.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 7: Plans - Operations and Optimizations We can control target-specific operations and optimizations using a plan. Examples include instruction pipelining, applying SIMD vector instructions, and so on. 
diff --git a/docs/Manual/08 Deferred Layout of Constant Arrays.md b/docs/Manual/08 Deferred Layout of Constant Arrays.md index 5a8109bd..b8a0d7c9 100644 --- a/docs/Manual/08 Deferred Layout of Constant Arrays.md +++ b/docs/Manual/08 Deferred Layout of Constant Arrays.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 8: Deferred layout of constant arrays Let's revisit the memory layout of constant arrays. As explained in [Section 1](<01%20Arrays.md>), the contents of constant arrays are known at compile-time, and these contents are immutable. Accera stores constant arrays in a non-standard memory layout optimized for a particular plan. In some cases, storing multiple copies of each array element may even prove advantageous (e.g., storing a matrix in row-major and column-major layouts). diff --git a/docs/Manual/09 Parameters.md b/docs/Manual/09 Parameters.md index b6b15696..fbc8f8f5 100644 --- a/docs/Manual/09 Parameters.md +++ b/docs/Manual/09 Parameters.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 9: Parameters diff --git a/docs/Manual/10 Packages.md b/docs/Manual/10 Packages.md index b2e3c4ad..67b9a2d2 100644 --- a/docs/Manual/10 Packages.md +++ b/docs/Manual/10 Packages.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Section 10: Building Packages The `Package` class represents a collection of Accera-generated functions. Whenever a package is built, it creates a stand-alone function library that other pieces of software can use. Currently, Accera supports two package formats: HAT and MLIR. 
diff --git a/docs/Manual/README.md b/docs/Manual/README.md index cd1c9d32..62513753 100644 --- a/docs/Manual/README.md +++ b/docs/Manual/README.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Accera v1.2.1 Manual diff --git a/docs/Reference/accera.md b/docs/Reference/accera.md index 3bf23e6d..45eb98fc 100644 --- a/docs/Reference/accera.md +++ b/docs/Reference/accera.md @@ -1,9 +1,10 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference # Module functions +* [`accera.cast`](functions/cast.md) `(value, type)` * [`accera.create_parameters`](functions/create_parameters.md) `(number)` * [`accera.create_parameter_grid`](functions/create_parameter_grid.md) `(parameter_choices, filter_func, sample)` * [`accera.fuse`](functions/fuse.md) `(schedules[, partial])` diff --git a/docs/Reference/classes/Array/Array.md b/docs/Reference/classes/Array/Array.md index 8766421f..8ab083ff 100644 --- a/docs/Reference/classes/Array/Array.md +++ b/docs/Reference/classes/Array/Array.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array(role[, data, element_type, layout, offset, shape])` Constructs an array. 
diff --git a/docs/Reference/classes/Array/Layout.md b/docs/Reference/classes/Array/Layout.md index c64c3395..c81070dc 100644 --- a/docs/Reference/classes/Array/Layout.md +++ b/docs/Reference/classes/Array/Layout.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array.Layout` type | description diff --git a/docs/Reference/classes/Array/Role.md b/docs/Reference/classes/Array/Role.md index a2bea01a..0dc36f35 100644 --- a/docs/Reference/classes/Array/Role.md +++ b/docs/Reference/classes/Array/Role.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array.Role` type | description diff --git a/docs/Reference/classes/Array/deferred_layout.md b/docs/Reference/classes/Array/deferred_layout.md index 4cf5452a..334e22d5 100644 --- a/docs/Reference/classes/Array/deferred_layout.md +++ b/docs/Reference/classes/Array/deferred_layout.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array.deferred_layout(cache)` Specifies the layout for a `Array.Role.CONST` array based on a `Cache`. For more details, see [Deferred layout of constant arrays](<../../../Manual/08%20Deferred%20Layout%20of%20Constant%20Arrays.md>) diff --git a/docs/Reference/classes/Array/sub_array.md b/docs/Reference/classes/Array/sub_array.md index 177b2b3c..ffa9fde3 100644 --- a/docs/Reference/classes/Array/sub_array.md +++ b/docs/Reference/classes/Array/sub_array.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Array.sub_array(offsets, shape, strides)` Creates a sub-array of a specific shape from an array. 
The sub-array is created from elements at specified offsets and strides into the original array. diff --git a/docs/Reference/classes/Nest/Nest.md b/docs/Reference/classes/Nest/Nest.md index 464c2935..cdf885fe 100644 --- a/docs/Reference/classes/Nest/Nest.md +++ b/docs/Reference/classes/Nest/Nest.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest(shape)` Creates an affine loop nest. diff --git a/docs/Reference/classes/Nest/create_plan.md b/docs/Reference/classes/Nest/create_plan.md index b7299ee4..daf3f898 100644 --- a/docs/Reference/classes/Nest/create_plan.md +++ b/docs/Reference/classes/Nest/create_plan.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest.create_plan([target])` Creates a plan using the default schedule for the nest. diff --git a/docs/Reference/classes/Nest/create_schedule.md b/docs/Reference/classes/Nest/create_schedule.md index 46b5c81b..1ee42a5e 100644 --- a/docs/Reference/classes/Nest/create_schedule.md +++ b/docs/Reference/classes/Nest/create_schedule.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest.create_schedule()` Create a default schedule for a nest. diff --git a/docs/Reference/classes/Nest/get_indices.md b/docs/Reference/classes/Nest/get_indices.md index f4188cc1..e7455bb5 100644 --- a/docs/Reference/classes/Nest/get_indices.md +++ b/docs/Reference/classes/Nest/get_indices.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest.get_indices()` Gets the iteration space dimensions for a nest. 
diff --git a/docs/Reference/classes/Nest/iteration_logic.md b/docs/Reference/classes/Nest/iteration_logic.md index 33066716..07a523ac 100644 --- a/docs/Reference/classes/Nest/iteration_logic.md +++ b/docs/Reference/classes/Nest/iteration_logic.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Nest.iteration_logic(logic)` Adds an iteration logic function to a `Nest`. diff --git a/docs/Reference/classes/Package/Format.md b/docs/Reference/classes/Package/Format.md index 0a8f7f60..6666aa95 100644 --- a/docs/Reference/classes/Package/Format.md +++ b/docs/Reference/classes/Package/Format.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.Format` type | description diff --git a/docs/Reference/classes/Package/Mode.md b/docs/Reference/classes/Package/Mode.md index 44dbae43..3ed99bc2 100644 --- a/docs/Reference/classes/Package/Mode.md +++ b/docs/Reference/classes/Package/Mode.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.Mode` type | description diff --git a/docs/Reference/classes/Package/Package.md b/docs/Reference/classes/Package/Package.md index faa4b3c3..c4c42939 100644 --- a/docs/Reference/classes/Package/Package.md +++ b/docs/Reference/classes/Package/Package.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.Package()` A package of functions that can be built and linked with client code. 
diff --git a/docs/Reference/classes/Package/Platform.md b/docs/Reference/classes/Package/Platform.md index d85fa9cd..970c209d 100644 --- a/docs/Reference/classes/Package/Platform.md +++ b/docs/Reference/classes/Package/Platform.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.Platform` type | description diff --git a/docs/Reference/classes/Package/add.md b/docs/Reference/classes/Package/add.md index b0761257..2c929715 100644 --- a/docs/Reference/classes/Package/add.md +++ b/docs/Reference/classes/Package/add.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.add(source, args[, base_name, parameters])` Adds one or more functions to the package. diff --git a/docs/Reference/classes/Package/add_description.md b/docs/Reference/classes/Package/add_description.md index 1d1249d1..41e25edc 100644 --- a/docs/Reference/classes/Package/add_description.md +++ b/docs/Reference/classes/Package/add_description.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.add_description([author, license, other, version])` Adds descriptive metadata to the HAT package. diff --git a/docs/Reference/classes/Package/build.md b/docs/Reference/classes/Package/build.md index acf294e2..707188e9 100644 --- a/docs/Reference/classes/Package/build.md +++ b/docs/Reference/classes/Package/build.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Package.build(name[, format, mode, platform, tolerance, output_dir])` Builds a HAT package. 
diff --git a/docs/Reference/classes/Plan/bind.md b/docs/Reference/classes/Plan/bind.md index 4c2a49ef..1e3de9ad 100644 --- a/docs/Reference/classes/Plan/bind.md +++ b/docs/Reference/classes/Plan/bind.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.bind(mapping)` Only available for targets that can execute a grid of work (such as GPUs). The `bind` function binds dimensions of the iteration space to axes of the target-specific grid (such as `v100.GridUnit.BLOCK_X`, `v100.GridUnit.THREAD_X` on an Nvidia GPU). diff --git a/docs/Reference/classes/Plan/cache.md b/docs/Reference/classes/Plan/cache.md index 41262435..09c2d368 100644 --- a/docs/Reference/classes/Plan/cache.md +++ b/docs/Reference/classes/Plan/cache.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.cache(source[, index, trigger_index, layout, level, trigger_level, max_elements, thrifty, location, double_buffer])` Adds a caching strategy to a plan. diff --git a/docs/Reference/classes/Plan/kernelize.md b/docs/Reference/classes/Plan/kernelize.md index db6f1c8d..4a860f74 100644 --- a/docs/Reference/classes/Plan/kernelize.md +++ b/docs/Reference/classes/Plan/kernelize.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.kernelize(unroll_indices[, vectorize_indices])` A convenience method for a sequence of `unroll` instructions followed by a possible sequence of `vectorize` instructions. 
diff --git a/docs/Reference/classes/Plan/parallelize.md b/docs/Reference/classes/Plan/parallelize.md index 4f07b468..83fb0e13 100644 --- a/docs/Reference/classes/Plan/parallelize.md +++ b/docs/Reference/classes/Plan/parallelize.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.parallelize(indices[, pin, policy])` diff --git a/docs/Reference/classes/Plan/tensorize.md b/docs/Reference/classes/Plan/tensorize.md index 40752110..7f4389c7 100644 --- a/docs/Reference/classes/Plan/tensorize.md +++ b/docs/Reference/classes/Plan/tensorize.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.tensorize(indices, mma_shape [, use_static_offsets, num_total_passes, num_fused_passes, scheduling_policy])` Only available for targets with native matrix multiplication instruction (tensor core) support. Marks the dimensions of the iteration-space for tensorization. Only perfectly nested loops of the following form can be tensorized: diff --git a/docs/Reference/classes/Plan/unroll.md b/docs/Reference/classes/Plan/unroll.md index 661f02cb..c1b288e7 100644 --- a/docs/Reference/classes/Plan/unroll.md +++ b/docs/Reference/classes/Plan/unroll.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.unroll(index)` Marks a dimension of the iteration-space for unrolling. 
diff --git a/docs/Reference/classes/Plan/vectorize.md b/docs/Reference/classes/Plan/vectorize.md index f390190b..a8ac6eeb 100644 --- a/docs/Reference/classes/Plan/vectorize.md +++ b/docs/Reference/classes/Plan/vectorize.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Plan.vectorize(index)` Only available for targets that have SIMD registers and support vector instructions. Marks a dimension of the iteration-space for vectorization. diff --git a/docs/Reference/classes/Schedule/create_plan.md b/docs/Reference/classes/Schedule/create_plan.md index 85374f9b..75fa8f6b 100644 --- a/docs/Reference/classes/Schedule/create_plan.md +++ b/docs/Reference/classes/Schedule/create_plan.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.create_plan([target])` Creates a plan for running this schedule. diff --git a/docs/Reference/classes/Schedule/is_valid_loop_order.md b/docs/Reference/classes/Schedule/is_valid_loop_order.md index 076d9b6e..a258a407 100644 --- a/docs/Reference/classes/Schedule/is_valid_loop_order.md +++ b/docs/Reference/classes/Schedule/is_valid_loop_order.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.is_valid_loop_order(*order)` The `is_valid_loop_order` function determines if an order of indices is valid. For a description of valid schedule orders, refer to [reorder](reorder.md). 
diff --git a/docs/Reference/classes/Schedule/pad.md b/docs/Reference/classes/Schedule/pad.md index c4859a21..b7b8ce84 100644 --- a/docs/Reference/classes/Schedule/pad.md +++ b/docs/Reference/classes/Schedule/pad.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.pad(index, size)` Pads the beginning of a specified dimension of the iteration-space with empty (no-op) elements. diff --git a/docs/Reference/classes/Schedule/reorder.md b/docs/Reference/classes/Schedule/reorder.md index 43e2bd11..243f8587 100644 --- a/docs/Reference/classes/Schedule/reorder.md +++ b/docs/Reference/classes/Schedule/reorder.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.reorder(order, *args)` The `reorder` transformation sets the order of the indices in the schedule. diff --git a/docs/Reference/classes/Schedule/skew.md b/docs/Reference/classes/Schedule/skew.md index b8f48ddd..3114c610 100644 --- a/docs/Reference/classes/Schedule/skew.md +++ b/docs/Reference/classes/Schedule/skew.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.skew(index, reference_index [, unroll_loops_smaller_than])` Transforms a dimension with respect to a reference dimension into a parallelogram by padding with empty elements. 
diff --git a/docs/Reference/classes/Schedule/split.md b/docs/Reference/classes/Schedule/split.md index 1909b862..7a316052 100644 --- a/docs/Reference/classes/Schedule/split.md +++ b/docs/Reference/classes/Schedule/split.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.split(index, size)` The `split` transformation takes a dimension `i` and a `size`, modifies `i`, and creates a new dimension `ii`. diff --git a/docs/Reference/classes/Schedule/tile.md b/docs/Reference/classes/Schedule/tile.md index ff0d18e1..8514cf1d 100644 --- a/docs/Reference/classes/Schedule/tile.md +++ b/docs/Reference/classes/Schedule/tile.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Schedule.tile(shape)` The `tile` transformation is a convenience syntax that takes a tuple of indices and a tuple of sizes, and splits each index by the corresponding size. The indices involved in the split are then ordered such that all the outer indices precede all of their respective inner indices. diff --git a/docs/Reference/classes/Target/Architecture.md b/docs/Reference/classes/Target/Architecture.md index 62611088..0ae50bcf 100644 --- a/docs/Reference/classes/Target/Architecture.md +++ b/docs/Reference/classes/Target/Architecture.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target.Architecture` Defines the supported target architectures. 
diff --git a/docs/Reference/classes/Target/Category.md b/docs/Reference/classes/Target/Category.md index 73e995ef..4de78aa9 100644 --- a/docs/Reference/classes/Target/Category.md +++ b/docs/Reference/classes/Target/Category.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target.Category` Defines the target processor category. diff --git a/docs/Reference/classes/Target/Model.md b/docs/Reference/classes/Target/Model.md index ea26c6d0..98940157 100644 --- a/docs/Reference/classes/Target/Model.md +++ b/docs/Reference/classes/Target/Model.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target.Model` Defines constants for some well-known CPU models. diff --git a/docs/Reference/classes/Target/Runtime.md b/docs/Reference/classes/Target/Runtime.md index 94726176..7e004789 100644 --- a/docs/Reference/classes/Target/Runtime.md +++ b/docs/Reference/classes/Target/Runtime.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target.Runtime` The runtime for code generation and/or compilation. 
diff --git a/docs/Reference/classes/Target/Target.md b/docs/Reference/classes/Target/Target.md index 7ae616a3..53f18231 100644 --- a/docs/Reference/classes/Target/Target.md +++ b/docs/Reference/classes/Target/Target.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.Target([architecture, cache_lines, cache_sizes, category, extensions, family, frequency_GHz, known_name, model, name, num_cores, num_threads, runtime, tensor_core_info, turbo_frequency_GHz, vector_bytes, vector_registers)` diff --git a/docs/Reference/enumerations/MMASchedulingPolicy.md b/docs/Reference/enumerations/MMASchedulingPolicy.md index 6f42ba32..81c1bc2f 100644 --- a/docs/Reference/enumerations/MMASchedulingPolicy.md +++ b/docs/Reference/enumerations/MMASchedulingPolicy.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.MMASchedulingPolicy` type | description diff --git a/docs/Reference/enumerations/MMAShape.md b/docs/Reference/enumerations/MMAShape.md index e792175f..8468d2a0 100644 --- a/docs/Reference/enumerations/MMAShape.md +++ b/docs/Reference/enumerations/MMAShape.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.MMAShape` The following table shows the matrix multiplication parameters associated with the different enum values, for different data types for a single pass. So for example a single pass of the `M32xN32xK2_B1` operation would take input matrices of dimensions [32x2] (A) and [2x32] (B) to produce a matrix multiplication result of dimensions [32x32] (C). These operations can then be composed together to perform matrix multiplication of larger matrices. 
diff --git a/docs/Reference/enumerations/ScalarType.md b/docs/Reference/enumerations/ScalarType.md index 881b0d77..f7dccd0a 100644 --- a/docs/Reference/enumerations/ScalarType.md +++ b/docs/Reference/enumerations/ScalarType.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.ScalarType` type | description diff --git a/docs/Reference/functions/cast.md b/docs/Reference/functions/cast.md new file mode 100644 index 00000000..a51c2a43 --- /dev/null +++ b/docs/Reference/functions/cast.md @@ -0,0 +1,77 @@ +[//]: # (Project: Accera) +[//]: # (Version: v1.2.7) + +# Accera v1.2.7 Reference + +## `accera.cast(value, type)` +The `cast` operation converts a value from one `acc.ScalarType` to another. + +Accera performs implicit casting between most types. Therefore, this operation should only be used to override the implicit casting behavior documented in [Section 2](<../../Manual/02%20Simple%20Affine%20Loop%20Nests.md>). + +Limitation: casting constants may result in truncation. + +[comment]: # (MISSING: examples for constant casting that cause unexpected truncation) + + +## Arguments + +argument | description | type/default +--- | --- | --- +`value` | The value to cast | +`type` | The destination type | `acc.ScalarType` + +## Returns
The result after casting + +## Examples + +Casting from float32 to int16: + +```python +A = acc.Array(role=acc.Array.Role.INPUT, element_type=acc.ScalarType.float32, shape=(10, 20)) +B = acc.Array(role=acc.Array.Role.INPUT_OUTPUT, element_type=acc.ScalarType.int16, shape=(10, 20)) + +nest = acc.Nest(shape=(10, 20)) +i, j = nest.get_indices() + +@nest.iteration_logic +def _(): + B[i, j] = acc.cast(A[i, j], acc.ScalarType.int16) # explicit cast to int16 + ... 
+``` + +In comparison, casting from int16 to float32 is implicit, which means the `cast` operation can be omitted: + +```python +A = acc.Array(role=acc.Array.Role.INPUT, element_type=acc.ScalarType.int16, shape=(10, 20)) +B = acc.Array(role=acc.Array.Role.INPUT_OUTPUT, element_type=acc.ScalarType.float32, shape=(10, 20)) + +nest = acc.Nest(shape=(10, 20)) +i, j = nest.get_indices() + +@nest.iteration_logic +def _(): + B[i, j] = A[i, j] # implicit cast to float32 + ... +``` + +Casting a constant to int8: + +```python + +A = acc.Array(role=acc.Array.Role.INPUT_OUTPUT, element_type=acc.ScalarType.int8, shape=(10, 20)) + +nest = acc.Nest(shape=(10, 20)) +i, j = nest.get_indices() + +@nest.iteration_logic +def _(): + A[i, j] = acc.cast(10, acc.ScalarType.int8) + ... + +``` + + +
+ + diff --git a/docs/Reference/functions/create_parameter_grid.md b/docs/Reference/functions/create_parameter_grid.md index d3d0dedd..9c61b1ad 100644 --- a/docs/Reference/functions/create_parameter_grid.md +++ b/docs/Reference/functions/create_parameter_grid.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.create_parameter_grid(parameter_choices, filter_func, sample, seed)` Create a parameter grid from a dictionary that maps each parameter to its possible values. diff --git a/docs/Reference/functions/create_parameters.md b/docs/Reference/functions/create_parameters.md index 9fd46de7..8ffc9e82 100644 --- a/docs/Reference/functions/create_parameters.md +++ b/docs/Reference/functions/create_parameters.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.create_parameters()` Creates placeholder parameters. diff --git a/docs/Reference/functions/fuse.md b/docs/Reference/functions/fuse.md index ca7880fd..090d3bf4 100644 --- a/docs/Reference/functions/fuse.md +++ b/docs/Reference/functions/fuse.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference ## `accera.fuse(schedules[, *args, partial])` The `fuse` operation combines multiple iteration spaces into a single "fused" iteration space. The fused iteration space represents the union of the work in the original spaces. 
diff --git a/docs/Reference/safety_analysis.md b/docs/Reference/safety_analysis.md index ed2c91de..1ac43468 100644 --- a/docs/Reference/safety_analysis.md +++ b/docs/Reference/safety_analysis.md @@ -1,7 +1,7 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) -# Accera v1.2.6 Reference +# Accera v1.2.7 Reference # Safety Analysis diff --git a/docs/Tutorials/Hello_MatMul.md b/docs/Tutorials/Hello_MatMul.md index 20645bbf..67d99a94 100644 --- a/docs/Tutorials/Hello_MatMul.md +++ b/docs/Tutorials/Hello_MatMul.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Hello MatMul diff --git a/docs/Tutorials/Hello_MatMul_GPU.md b/docs/Tutorials/Hello_MatMul_GPU.md index a0e47136..675db836 100644 --- a/docs/Tutorials/Hello_MatMul_GPU.md +++ b/docs/Tutorials/Hello_MatMul_GPU.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Hello MatMul GPU diff --git a/docs/Tutorials/Optimized_MatMul.md b/docs/Tutorials/Optimized_MatMul.md index c29c3b88..301c01b1 100644 --- a/docs/Tutorials/Optimized_MatMul.md +++ b/docs/Tutorials/Optimized_MatMul.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) ## Optimized MatMul diff --git a/docs/Tutorials/Pi3_Cross_Compilation.md b/docs/Tutorials/Pi3_Cross_Compilation.md index 50274887..b53f60e8 100644 --- a/docs/Tutorials/Pi3_Cross_Compilation.md +++ b/docs/Tutorials/Pi3_Cross_Compilation.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Cross Compiling for the Raspberry Pi 3 diff --git a/docs/Tutorials/README.md b/docs/Tutorials/README.md index b21939e2..817bec04 100644 --- a/docs/Tutorials/README.md +++ b/docs/Tutorials/README.md @@ -1,5 +1,5 @@ [//]: # (Project: Accera) -[//]: # (Version: v1.2.6) +[//]: # (Version: v1.2.7) # Accera Tutorials diff --git a/tools/benchmarkers/gpu_benchmark_tool.py 
b/tools/benchmarkers/gpu_benchmark_tool.py index 44cb910e..bb1690d0 100755 --- a/tools/benchmarkers/gpu_benchmark_tool.py +++ b/tools/benchmarkers/gpu_benchmark_tool.py @@ -210,7 +210,7 @@ def prepare_system_for_benchmark(target, available_gpus): def main(args=[]): parser = argparse.ArgumentParser() parser.add_argument('-d', '--devices', help='The devices to use for the benchmark', required=False, default="0,1,2,3") - parser.add_argument('-i', '--input', help='The input config file (csv)', required=False) + parser.add_argument('-i', '--input', help='Comma-separated list of input config files (csv)', required=False) parser.add_argument('-y', '--type', help='The data type for the input set, h or fp16, s for fp32', required=False) parser.add_argument('-b', '--branch', help='The git branch to use to tag the results to', required=False) parser.add_argument('-z', '--string', help='input config string (csv, semi-colon per row)', required=False) @@ -241,15 +241,18 @@ def main(args=[]): args.string = ','.join(gemm_opts.CONFIG_HEADERS) + '\n' + '\n'.join(args.string.split(';')) f = StringIO(args.string) - try: - if f is None: - f = open(args.input) + gemm_inputs = [] + if f is None: + input_files = args.input.split(",") + for file in input_files: + try: + f = open(file) - reader = csv.DictReader(f, gemm_opts.CONFIG_HEADERS) - gemm_inputs = [gemm_opts.GemmOpts(**data) for data in islice(reader, 1, None)] + reader = csv.DictReader(f, gemm_opts.CONFIG_HEADERS) + gemm_inputs += [gemm_opts.GemmOpts(**data) for data in islice(reader, 1, None)] - finally: - f.close() + finally: + f.close() available_gpus = [] devices = args.devices.split(",")